In [1]:
!pip -q install transformers
!pip -q install sentencepiece

In [7]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
# Register tqdm's pandas integration; the script later calls `progress_apply`.
tqdm.pandas()

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = ("cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration; the script reads these as class attributes."""
    ######################
    # Globals #
    ######################
    debug = False  # when True, main() trains 1 epoch on a 1000-row sample
    exp_name = "exp4001"  # names the output directory
    fold_seed = 0  # not read by the code visible in this file
    seed = 0  # passed to seed_torch() and used as sampling random_state
    start_epoch = 0  # first epoch index of the training loop
    epochs = 5  # number of epochs run from start_epoch
    train = True  # main() only trains when this is True
    folds = [0,1,2,3,4]  # not read by the code visible in this file
    n_fold = 5  # not read by the code visible in this file
    print_freq = 100  # progress is printed every `print_freq` steps
    use_amp = True  # enables autocast + GradScaler in train_fn
    target_col = "pseudo_label"  # label column read by TrainDataset
    ######################
    # Dataset #
    ######################
    head = 64  # leading tokens kept when a sequence is too long
    tail = 64  # trailing tokens kept; 0 switches to plain truncation
    max_length = head+tail  # fixed length every sample is padded to
    under_sampling = False  # when True, train_loop subsamples rows with target == 0
    under_sampling_ratio = 0.1  # fraction of target == 0 rows kept
    ######################
    # Augmentation #
    ######################

    ######################
    # Loaders #
    ######################
    batch_size = 64  # training batch size; validation uses twice this
    num_workers = 8  # DataLoader worker processes
    ######################
    # Model #
    ######################
    # Hugging Face model id handed to AutoModel / AutoTokenizer.
    # NOTE(review): the link previously noted here,
    # https://huggingface.co/deepset/xlm-roberta-base-squad2, names a different checkpoint.
    base_model_name = "unitary/multilingual-toxic-xlm-roberta"
    pretrained = True  # not read by the code visible in this file
    num_classes = 1  # width of the output layer (a single score, trained with loss_name)
    hidden_node = 768  # large: 1024, base: 768
    ######################
    # Criterion #
    ######################
    loss_name = "MSELoss"  # looked up on torch.nn first, then in __CRITERIONS__
    loss_params: dict = {}  # keyword arguments for the loss constructor
    ######################
    # Optimizer #
    ######################
    optimizer_name = "AdamW"  # looked up in __OPTIMIZERS__ first, then torch.optim
    optimizer_params = {
        "lr": 1e-5,
    }
    ######################
    # Scheduler #
    ######################
    scheduler = "cosine"  # get_scheduler handles 'linear' and 'cosine'
    num_cycles=0.5  # forwarded to the cosine schedule
    num_warmup_steps_ratio = 0.1  # warmup steps as a fraction of total training steps
    

# ====================================================
# Directory settings
# ====================================================
# Input data root and the per-experiment output directory.
INPUT_PATH = "../input/"
OUTPUT_DIR = f'../output/{CFG.exp_name}/'
# exist_ok=True makes creation idempotent and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Create the module logger that writes plain messages to console and file.

    ``getLogger(__name__)`` returns the same logger object on every call, so
    re-running this function (for example by re-executing a notebook cell)
    used to stack extra handlers and emit every message several times.
    Handlers from earlier calls are now removed and closed first.

    Args:
        log_file: Path of the log file.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Remove handlers installed by a previous call to avoid duplicate output.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the fraction of pairs whose "more toxic" score is strictly higher."""
    ranked_correctly = more_toxic_preds > less_toxic_preds
    return np.mean(ranked_correctly)



def get_result(df):
    """Score a frame of paired predictions, log the score, and return it.

    Args:
        df: DataFrame with ``more_toxic_preds`` and ``less_toxic_preds`` columns.

    Returns:
        The pairwise ranking score computed by ``get_score``.
    """
    score = get_score(
        df["more_toxic_preds"].values,
        df["less_toxic_preds"].values,
    )
    LOGGER.info(f"Score: {score:<.4f}")
    return score

def read_processed_data():
    """Load the competition CSVs and the pseudo-labelled training set.

    Training rows whose ``comment_text`` also appears in the validation pairs
    are removed from the training set.

    Returns:
        Tuple ``(train, validation_data, test, sub)`` of DataFrames.
    """
    def _load(file_name):
        return pd.read_csv(INPUT_PATH + file_name)

    validation_data = _load("validation_data.csv")
    test = _load("comments_to_score.csv")
    sub = _load("sample_submission.csv")
    train_src = pd.read_csv("../input/jigsaw_1st/PseudoLabelDataset.csv")

    paired_comments = pd.concat(
        [validation_data['less_toxic'], validation_data['more_toxic']]
    )
    in_validation = train_src['comment_text'].isin(paired_comments.unique())
    train2017 = train_src[~in_validation]
    print(train2017.head())
    return train2017, validation_data, test, sub

# Compiled once at import time; text_cleaning runs on every comment.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# Raw string: the original non-raw literal relied on invalid "\d" escapes.
_IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')


def text_cleaning(text):
    """Strip links, e-mail addresses, IPv4-looking numbers and newlines.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string with surrounding whitespace removed.
    """
    text = _URL_PATTERN.sub(r'', text)    # drop website links
    text = _EMAIL_PATTERN.sub(r'.', text)  # replace e-mail addresses with "."
    text = _IP_PATTERN.sub(r'', text)     # drop IP addresses
    # Newlines are deleted outright, so adjacent lines are joined without a space.
    text = text.replace('\n','')
    return text.strip()

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Track the latest value plus a running weighted sum, count and mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero every statistic."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as ``"<minutes>m <seconds>s"``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed time and the estimated time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work completed so far (must be non-zero).

    Returns:
        A string of the form ``"<elapsed> (remain <remaining>)"``.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))


def prepare_input(text, tokenizer):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    With ``CFG.tail == 0`` the tokenizer truncates and pads to
    ``CFG.max_length`` itself. Otherwise sequences longer than
    ``CFG.max_length`` keep their first ``CFG.head`` and last ``CFG.tail``
    tokens, and shorter ones are right-padded (``pad_token_id`` for
    ``input_ids``, 0 for every other field).

    Args:
        text: String to encode.
        tokenizer: Hugging Face tokenizer providing ``encode_plus`` and
            ``pad_token_id``.

    Returns:
        The tokenizer output with every value replaced by a 1-D long tensor
        of length ``CFG.max_length``.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # `pad_to_max_length=True` is deprecated in favour of this form.
            padding="max_length",
            truncation=True
            )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
    else:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            truncation=True
            )
        for k, v in inputs.items():
            if len(v) > CFG.max_length:
                # Keep both ends of an over-long sequence.
                v = list(v[:CFG.head]) + list(v[-CFG.tail:])
            # Integer buffers avoid a float round trip for token ids.
            if k == 'input_ids':
                new_v = np.ones(CFG.max_length, dtype=np.int64) * tokenizer.pad_token_id
            else:
                new_v = np.zeros(CFG.max_length, dtype=np.int64)
            # Use the current length (after head/tail cut), not the original.
            new_v[:len(v)] = v
            inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


# https://zenn.dev/hellorusk/articles/7fd588cae5b173
# Differences between the huggingface Tokenizer's tokenize, encode, encode_plus, etc.
class TrainDataset(Dataset):
    """Dataset yielding tokenized comments, with labels when ``is_train``.

    Args:
        df: DataFrame with a ``text`` column and, when ``is_train`` is true,
            the ``CFG.target_col`` label column.
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        max_length: Stored on the instance; padding length itself comes from
            ``CFG`` inside ``prepare_input``.
        is_train: Whether items include a ``label`` entry.
    """

    def __init__(self, df, tokenizer, max_length, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length
        self.text = df["text"].values
        # Pull the label array once instead of on every __getitem__ call.
        self.labels = df[CFG.target_col].values if is_train else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # prepare_input already returns long tensors; wrapping them in
        # torch.tensor again only copied them and raised a UserWarning.
        sample = {
            'ids': inputs["input_ids"],
            'mask': inputs["attention_mask"],
        }
        if self.is_train:
            sample['label'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# ====================================================
# LOSS
# ====================================================

# Registry of custom loss classes, consulted when torch.nn has no match.
__CRITERIONS__ = {}


def get_criterion():
    """Instantiate the loss named by ``CFG.loss_name`` with ``CFG.loss_params``.

    ``torch.nn`` is searched first, then ``__CRITERIONS__``.

    Raises:
        NotImplementedError: If the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        return getattr(nn, loss_name)(**CFG.loss_params)
    custom_loss = __CRITERIONS__.get(loss_name)
    if custom_loss is not None:
        return custom_loss(**CFG.loss_params)
    raise NotImplementedError(
        f"Unknown loss {loss_name!r}: not in torch.nn or __CRITERIONS__"
    )

# ====================================================
# Train
# ====================================================

# Registry of custom optimizers, consulted before torch.optim.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Build the optimizer named by ``CFG.optimizer_name`` for ``model``.

    A class registered in ``__OPTIMIZERS__`` takes precedence; otherwise the
    attribute of that name on ``torch.optim`` is used. ``CFG.optimizer_params``
    supplies the keyword arguments.
    """
    optimizer_cls = __OPTIMIZERS__.get(CFG.optimizer_name)
    if optimizer_cls is None:
        optimizer_cls = getattr(optim, CFG.optimizer_name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate schedule selected by ``cfg.scheduler``.

    Args:
        cfg: Object exposing ``scheduler`` (``'linear'`` or ``'cosine'``),
            ``num_warmup_steps`` and, for cosine, ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: If ``cfg.scheduler`` is not a supported name. The original
            fell through and failed with ``UnboundLocalError``.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )

class Model(nn.Module):
    """Pretrained transformer encoder with a mean-pooled linear output head.

    Args:
        modelname_or_path: Identifier or path given to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, modelname_or_path):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged over dimension 1."""
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        # NOTE(review): the mean covers every position, padding included; the
        # attention mask is not applied to the pooling — confirm this is intended.
        return encoder_out[0].mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Pool the encoder output, apply dropout, and project with ``fc``."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))




def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler,device):
    """Train ``model`` for one epoch and return the average loss.

    Args:
        train_loader: Iterable of batches with ``ids``, ``mask`` and ``label``.
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss applied to the predictions and labels.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only in the progress output.
        scheduler: Optional LR scheduler stepped once per batch; may be None.
        device: Device the batch tensors are moved to.

    Returns:
        Sample-weighted mean of the per-batch losses.
    """
    scaler = GradScaler() if CFG.use_amp else None
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    num_steps = len(train_loader)
    for step, batch_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        label = batch_data['label'].to(device)
        batch_size = ids.size(0)
        # squeeze(-1) drops only the trailing output axis; a bare squeeze()
        # would also drop the batch axis when a batch holds one sample.
        if CFG.use_amp:
            with autocast():
                outputs = model(input_ids=ids, attention_mask=mask)
                loss = criterion(outputs.squeeze(-1), label)
        else:
            outputs = model(input_ids=ids, attention_mask=mask)
            loss = criterion(outputs.squeeze(-1), label)
        # record loss
        losses.update(loss.item(), batch_size)
        if CFG.use_amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (num_steps-1):
            # get_last_lr() is the supported accessor; fall back to the
            # optimizer so a None scheduler no longer crashes the progress print.
            if scheduler is not None:
                current_lr = scheduler.get_last_lr()[0]
            else:
                current_lr = optimizer.param_groups[0]["lr"]
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "LR: {lr: 8f}"
                .format(
                    epoch+1, step, num_steps, batch_time=batch_time,
                    data_time=data_time, loss=losses,
                    remain=timeSince(start, float(step+1)/num_steps),
                    lr=current_lr
                    )
                )
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Run inference over ``valid_loader`` and return stacked predictions.

    Args:
        valid_loader: Iterable of batches with ``ids`` and ``mask``.
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Unused; kept for signature compatibility.
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array of all batch outputs concatenated along the first axis.
    """
    batch_time, data_time = AverageMeter(), AverageMeter()
    # switch to evaluation mode
    model.eval()
    collected = []
    total_steps = len(valid_loader)
    start = end = time.time()
    for step, batch_data in enumerate(valid_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        # forward pass without gradient tracking
        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask)
        collected.append(outputs.cpu().numpy())
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        is_last_step = step == (total_steps - 1)
        if is_last_step or step % CFG.print_freq == 0:
            print(
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                .format(
                    step, total_steps, batch_time=batch_time,
                    data_time=data_time,
                    remain=timeSince(start, float(step+1)/total_steps),
                    )
                )
    return np.concatenate(collected)


def _merge_text_predictions(pairs, scored_texts, text_col, pred_col):
    """Left-join per-text predictions onto ``pairs`` as column ``pred_col``."""
    lookup = scored_texts[["text", "pred"]].rename(
        columns={"text": text_col, "pred": pred_col}
    )
    return pairs.merge(lookup, on=text_col, how='left')


def train_loop(folds, validation):
    """Fine-tune the model and score it on the validation pairs every epoch.

    Each unique validation comment is predicted once per epoch; predictions
    are joined back onto the pair table and scored with ``get_result``. The
    model weights are saved whenever the score improves.

    Args:
        folds: Training DataFrame with ``comment_text`` and the label column.
        validation: DataFrame with ``less_toxic`` and ``more_toxic`` columns.
            It is no longer modified in place.

    Returns:
        Tuple ``(validation_last, validation_less, validation_more)``:
        the original pairs with the best epoch's predictions, and the cleaned
        pairs with the final epoch's less-/more-toxic predictions attached.
    """
    validation_last = validation.copy()
    # Clean a private copy so the caller's frame keeps its raw text.
    validation = validation.copy()
    # ====================================================
    # loader
    # ====================================================
    train_folds = folds.reset_index(drop=True)

    print("Text cleaning...")
    train_folds['text'] = train_folds['comment_text'].progress_apply(text_cleaning)
    validation['less_toxic'] = validation['less_toxic'].progress_apply(text_cleaning)
    validation['more_toxic'] = validation['more_toxic'].progress_apply(text_cleaning)

    print("Train Shape:", train_folds.shape)
    if CFG.under_sampling:
        print("Under Sampling")
        train_folds_0 = train_folds[train_folds["target"]==0]
        train_folds_0 = train_folds_0.sample(frac=CFG.under_sampling_ratio, random_state=CFG.seed)
        train_folds_1 = train_folds[train_folds["target"]>0]
        train_folds = pd.concat([train_folds_1, train_folds_0], axis=0).reset_index(drop=True)
        print("Train Shape (After under sampling):", train_folds.shape)

    # Predict every distinct comment once, then join back onto the pairs.
    validation_data = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    validation_data = pd.DataFrame({'text': validation_data}).reset_index()
    print("Valid Shape:", validation_data.shape)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

    train_dataset = TrainDataset(train_folds, tokenizer, CFG.max_length, is_train=True)
    valid_dataset = TrainDataset(validation_data, tokenizer, CFG.max_length, is_train=False)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    # initialize
    model = Model(CFG.base_model_name)
    model.to(device)
    criterion = get_criterion()

    optimizer = get_optimizer(model)
    # The scheduler steps once per batch, so count real batches: with
    # drop_last=True, len(rows) / batch_size overestimates the step count.
    num_train_steps = len(train_loader) * CFG.epochs
    # The transformers schedule helpers document an integer warmup step count.
    CFG.num_warmup_steps = int(num_train_steps * CFG.num_warmup_steps_ratio)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    best_score = 0

    for epoch in range(CFG.start_epoch, CFG.start_epoch + CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
            )

        # eval
        preds = valid_fn(
            valid_loader,
            model,
            criterion,
            device
            )

        # scoring
        validation_data["pred"] = preds

        # Drop the previous epoch's prediction columns before re-merging.
        stale_cols = [
            col for col in ('less_toxic_preds', 'more_toxic_preds')
            if col in validation.columns
        ]
        if stale_cols:
            validation = validation.drop(columns=stale_cols)

        validation_less = _merge_text_predictions(
            validation, validation_data, 'less_toxic', 'less_toxic_preds'
        )
        validation_more = _merge_text_predictions(
            validation, validation_data, 'more_toxic', 'more_toxic_preds'
        )
        # Pair table carrying both prediction columns (less first, then more).
        validation = _merge_text_predictions(
            validation_less, validation_data, 'more_toxic', 'more_toxic_preds'
        )

        # scoring
        score = get_result(validation)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s")
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save : {score:.4f} Model")
            torch.save({"model": model.state_dict()},
                        OUTPUT_DIR+'multilingual-toxic-xlm-roberta_best_score.pth')
            validation_last["more_toxic_preds"] = validation["more_toxic_preds"]
            validation_last["less_toxic_preds"] = validation["less_toxic_preds"]
    # NOTE(review): validation_less / validation_more come from the final
    # epoch, while validation_last holds the best epoch's predictions.
    return validation_last, validation_less, validation_more


def main():
    """Seed the run, load the data, train, and write the prediction CSVs.

    In debug mode training is cut to one epoch on a 1000-row sample. Nothing
    beyond seeding and loading happens when ``CFG.train`` is false.
    """
    seed_torch(seed=CFG.seed)
    train, validation_data, test, sub = read_processed_data()
    if CFG.debug:
        CFG.epochs = 1
        train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
    oof_df = validation_data.copy()
    if not CFG.train:
        return

    # train
    n_pairs = len(validation_data)
    _oof_df, validation_less, validation_more = train_loop(train, validation_data)
    oof_more_toxic = np.zeros(n_pairs) + _oof_df["more_toxic_preds"].values
    oof_less_toxic = np.zeros(n_pairs) + _oof_df["less_toxic_preds"].values
    get_result(_oof_df)

    # CV result
    LOGGER.info("========== CV ==========")
    oof_df["more_toxic_preds"] = oof_more_toxic
    oof_df["less_toxic_preds"] = oof_less_toxic
    get_result(oof_df)

    # save result
    for frame, file_name in (
        (validation_less, "less_df.csv"),
        (validation_more, "more_df.csv"),
    ):
        frame.to_csv(OUTPUT_DIR+file_name, index=False)

In [8]:
# Entry point: run the pipeline when this module is executed directly.
if __name__ == "__main__":
    main()

                 id  ... pseudo_label
0  0000997932d777bf  ...    -0.493725
1  000103f0d9cfb60f  ...    -0.636400
2  000113f07ec002fd  ...    -0.496514
3  0001b41b1c6bb37e  ...    -0.254959
4  0001d958c54c6e35  ...    -0.461024

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Valid Shape: (14237, 2)


Downloading:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/635 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch: [1][0/3373] Data 1.486 (1.486) Elapsed 0m 2s (remain 134m 21s) Loss: 0.2020(0.2020) LR:  0.000000
Epoch: [1][100/3373] Data 0.000 (0.015) Elapsed 1m 19s (remain 43m 10s) Loss: 0.1128(0.1540) LR:  0.000001
Epoch: [1][200/3373] Data 0.000 (0.008) Elapsed 2m 37s (remain 41m 28s) Loss: 0.0459(0.1080) LR:  0.000001
Epoch: [1][300/3373] Data 0.000 (0.005) Elapsed 3m 55s (remain 40m 2s) Loss: 0.0292(0.0817) LR:  0.000002
Epoch: [1][400/3373] Data 0.000 (0.004) Elapsed 5m 13s (remain 38m 40s) Loss: 0.0090(0.0658) LR:  0.000002
Epoch: [1][500/3373] Data 0.000 (0.003) Elapsed 6m 30s (remain 37m 20s) Loss: 0.0134(0.0551) LR:  0.000003
Epoch: [1][600/3373] Data 0.000 (0.003) Elapsed 7m 48s (remain 36m 0s) Loss: 0.0085(0.0474) LR:  0.000004
Epoch: [1][700/3373] Data 0.000 (0.002) Elapsed 9m 6s (remain 34m 41s) Loss: 0.0062(0.0416) LR:  0.000004
Epoch: [1][800/3373] Data 0.000 (0.002) Elapsed 10m 23s (remain 33m 23s) Loss: 0.0072(0.0371) LR:  0.000005
Epoch: [1][900/3373] Data 0.000 (0.002) E

Score: 0.7162
Score: 0.7162
Epoch 1 - avg_train_loss: 0.0110  time: 2675s
Epoch 1 - avg_train_loss: 0.0110  time: 2675s
Epoch 1 - Score: 0.7162
Epoch 1 - Score: 0.7162
Epoch 1 - Save : 0.7162 Model
Epoch 1 - Save : 0.7162 Model


Epoch: [2][0/3373] Data 0.643 (0.643) Elapsed 0m 1s (remain 80m 36s) Loss: 0.0043(0.0043) LR:  0.000010
Epoch: [2][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 44s) Loss: 0.0022(0.0021) LR:  0.000010
Epoch: [2][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 15s) Loss: 0.0015(0.0020) LR:  0.000010
Epoch: [2][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 53s) Loss: 0.0016(0.0021) LR:  0.000010
Epoch: [2][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 34s) Loss: 0.0013(0.0021) LR:  0.000010
Epoch: [2][500/3373] Data 0.000 (0.001) Elapsed 6m 29s (remain 37m 15s) Loss: 0.0019(0.0021) LR:  0.000009
Epoch: [2][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 57s) Loss: 0.0014(0.0021) LR:  0.000009
Epoch: [2][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 38s) Loss: 0.0016(0.0021) LR:  0.000009
Epoch: [2][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 20s) Loss: 0.0016(0.0021) LR:  0.000009
Epoch: [2][900/3373] Data 0.000 (0.001) 

Score: 0.7206
Score: 0.7206
Epoch 2 - avg_train_loss: 0.0020  time: 2674s
Epoch 2 - avg_train_loss: 0.0020  time: 2674s
Epoch 2 - Score: 0.7206
Epoch 2 - Score: 0.7206
Epoch 2 - Save : 0.7206 Model
Epoch 2 - Save : 0.7206 Model


Epoch: [3][0/3373] Data 0.502 (0.502) Elapsed 0m 1s (remain 72m 14s) Loss: 0.0015(0.0015) LR:  0.000008
Epoch: [3][100/3373] Data 0.000 (0.005) Elapsed 1m 19s (remain 42m 39s) Loss: 0.0012(0.0018) LR:  0.000007
Epoch: [3][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 13s) Loss: 0.0012(0.0017) LR:  0.000007
Epoch: [3][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 52s) Loss: 0.0010(0.0017) LR:  0.000007
Epoch: [3][400/3373] Data 0.000 (0.001) Elapsed 5m 12s (remain 38m 33s) Loss: 0.0020(0.0018) LR:  0.000007
Epoch: [3][500/3373] Data 0.000 (0.001) Elapsed 6m 29s (remain 37m 15s) Loss: 0.0019(0.0018) LR:  0.000007
Epoch: [3][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 56s) Loss: 0.0010(0.0018) LR:  0.000007
Epoch: [3][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 38s) Loss: 0.0015(0.0018) LR:  0.000007
Epoch: [3][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 20s) Loss: 0.0014(0.0018) LR:  0.000007
Epoch: [3][900/3373] Data 0.000 (0.001) 

Score: 0.7214
Score: 0.7214
Epoch 3 - avg_train_loss: 0.0017  time: 2675s
Epoch 3 - avg_train_loss: 0.0017  time: 2675s
Epoch 3 - Score: 0.7214
Epoch 3 - Score: 0.7214
Epoch 3 - Save : 0.7214 Model
Epoch 3 - Save : 0.7214 Model


Epoch: [4][0/3373] Data 0.595 (0.595) Elapsed 0m 1s (remain 77m 53s) Loss: 0.0012(0.0012) LR:  0.000004
Epoch: [4][100/3373] Data 0.000 (0.006) Elapsed 1m 19s (remain 42m 43s) Loss: 0.0008(0.0016) LR:  0.000004
Epoch: [4][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 15s) Loss: 0.0020(0.0016) LR:  0.000004
Epoch: [4][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 54s) Loss: 0.0013(0.0016) LR:  0.000004
Epoch: [4][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 34s) Loss: 0.0019(0.0015) LR:  0.000004
Epoch: [4][500/3373] Data 0.000 (0.001) Elapsed 6m 30s (remain 37m 15s) Loss: 0.0012(0.0015) LR:  0.000004
Epoch: [4][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 57s) Loss: 0.0014(0.0015) LR:  0.000004
Epoch: [4][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 39s) Loss: 0.0013(0.0015) LR:  0.000003
Epoch: [4][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 21s) Loss: 0.0012(0.0015) LR:  0.000003
Epoch: [4][900/3373] Data 0.000 (0.001) 

Score: 0.7215
Score: 0.7215
Epoch 4 - avg_train_loss: 0.0015  time: 2675s
Epoch 4 - avg_train_loss: 0.0015  time: 2675s
Epoch 4 - Score: 0.7215
Epoch 4 - Score: 0.7215
Epoch 4 - Save : 0.7215 Model
Epoch 4 - Save : 0.7215 Model


Epoch: [5][0/3373] Data 0.690 (0.690) Elapsed 0m 1s (remain 83m 27s) Loss: 0.0011(0.0011) LR:  0.000001
Epoch: [5][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 46s) Loss: 0.0011(0.0015) LR:  0.000001
Epoch: [5][200/3373] Data 0.000 (0.004) Elapsed 2m 36s (remain 41m 16s) Loss: 0.0007(0.0014) LR:  0.000001
Epoch: [5][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 55s) Loss: 0.0014(0.0014) LR:  0.000001
Epoch: [5][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 35s) Loss: 0.0012(0.0014) LR:  0.000001
Epoch: [5][500/3373] Data 0.000 (0.002) Elapsed 6m 30s (remain 37m 16s) Loss: 0.0011(0.0014) LR:  0.000001
Epoch: [5][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 57s) Loss: 0.0023(0.0014) LR:  0.000001
Epoch: [5][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 39s) Loss: 0.0009(0.0014) LR:  0.000001
Epoch: [5][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 21s) Loss: 0.0010(0.0014) LR:  0.000001
Epoch: [5][900/3373] Data 0.000 (0.001) 

Score: 0.7219
Score: 0.7219
Epoch 5 - avg_train_loss: 0.0014  time: 2675s
Epoch 5 - avg_train_loss: 0.0014  time: 2675s
Epoch 5 - Score: 0.7219
Epoch 5 - Score: 0.7219
Epoch 5 - Save : 0.7219 Model
Epoch 5 - Save : 0.7219 Model
Score: 0.7219
Score: 0.7219
Score: 0.7219
Score: 0.7219


In [9]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
# Register tqdm's pandas integration, which provides `progress_apply`.
tqdm.pandas()

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = ("cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration for the seed-1 run, read as class attributes."""
    ######################
    # Globals #
    ######################
    debug = False  # debug switch
    exp_name = "exp4001_seed1"  # names the output directory
    fold_seed = 0  # not read by the code visible in this cell
    seed = 1  # random seed for this run
    start_epoch = 0  # first epoch index
    epochs = 5  # number of training epochs
    train = True  # training switch
    folds = [0,1,2,3,4]  # not read by the code visible in this cell
    n_fold = 5  # not read by the code visible in this cell
    print_freq = 100  # presumably the progress print interval in steps
    use_amp = True  # presumably toggles mixed-precision training
    target_col = "pseudo_label"  # name of the training-label column
    ######################
    # Dataset #
    ######################
    head = 64  # leading-token budget
    tail = 64  # trailing-token budget
    max_length = head+tail  # total sequence length
    under_sampling = False  # under-sampling switch
    under_sampling_ratio = 0.1  # sampling fraction used when under_sampling is on
    ######################
    # Augmentation #
    ######################

    ######################
    # Loaders #
    ######################
    batch_size = 64  # training batch size
    num_workers = 8  # data-loading worker count
    ######################
    # Model #
    ######################
    # Hugging Face model id of the base checkpoint.
    # NOTE(review): the link previously noted here,
    # https://huggingface.co/deepset/xlm-roberta-base-squad2, names a different checkpoint.
    base_model_name = "unitary/multilingual-toxic-xlm-roberta"
    pretrained = True  # not read by the code visible in this cell
    num_classes = 1  # number of model outputs (a single score; loss is MSELoss)
    hidden_node = 768  # large: 1024, base: 768
    ######################
    # Criterion #
    ######################
    loss_name = "MSELoss"  # name of the loss class
    loss_params: dict = {}  # keyword arguments for the loss constructor
    ######################
    # Optimizer #
    ######################
    optimizer_name = "AdamW"  # name of the optimizer class
    optimizer_params = {
        "lr": 1e-5,
    }
    ######################
    # Scheduler #
    ######################
    scheduler = "cosine"  # learning-rate schedule name
    num_cycles=0.5  # cosine-schedule cycle count
    num_warmup_steps_ratio = 0.1  # warmup length as a fraction of training steps
    

# ====================================================
# Directory settings
# ====================================================
# Input data root and the per-experiment output directory.
INPUT_PATH = "../input/"
OUTPUT_DIR = f'../output/{CFG.exp_name}/'
# exist_ok=True makes creation idempotent and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and force deterministic cuDNN.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # Trade cuDNN autotuning for run-to-run reproducibility.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Create the module logger that writes plain messages to console and file.

    ``getLogger(__name__)`` returns the same logger object on every call, and
    this notebook defines and calls ``init_logger`` in more than one cell.
    Without cleanup each call stacks two more handlers, duplicating every
    message and still writing to the previous run's log file. Handlers from
    earlier calls are therefore removed and closed first.

    Args:
        log_file: Path of the log file.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Remove handlers installed by a previous call to avoid duplicate output.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the fraction of pairs ranked in the expected order.

    A pair counts as correct when the prediction for the "more toxic" text is
    strictly greater than the prediction for the "less toxic" text.
    """
    correctly_ranked = more_toxic_preds > less_toxic_preds
    return np.mean(correctly_ranked)



def get_result(df):
    """Score a frame of pairwise predictions, log the score and return it.

    Args:
        df: DataFrame with "more_toxic_preds" and "less_toxic_preds" columns.

    Returns:
        The value `get_score` computes from the two columns.
    """
    score = get_score(df["more_toxic_preds"].values, df["less_toxic_preds"].values)
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def read_processed_data():
    """Load the CSV inputs and drop training rows whose text appears in validation.

    Returns:
        Tuple `(train, validation_data, test, sub)`. `train` is the
        pseudo-label dataset without the rows whose `comment_text` equals a
        `less_toxic` or `more_toxic` text of the validation pairs.
    """
    validation_data, test, sub = (
        pd.read_csv(INPUT_PATH + file_name)
        for file_name in ("validation_data.csv", "comments_to_score.csv", "sample_submission.csv")
    )
    pseudo_labelled = pd.read_csv("../input/jigsaw_1st/PseudoLabelDataset.csv")
    # Exact string match on the raw text of both sides of every validation pair.
    validation_texts = pd.concat([validation_data['less_toxic'], validation_data['more_toxic']]).unique()
    seen_in_validation = pseudo_labelled['comment_text'].isin(validation_texts)
    train2017 = pseudo_labelled[~seen_in_validation]
    print(train2017.head())
    return train2017, validation_data, test, sub


# Patterns are compiled once instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
_IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')


def text_cleaning(text):
    """Strip links, e-mail addresses, IPv4-looking numbers and newlines from `text`.

    Steps, in order: website links are removed, each e-mail address is
    replaced by a single ".", dotted four-number groups are removed, newline
    characters are deleted (no space is inserted in their place), and
    leading/trailing whitespace is stripped.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = _URL_PATTERN.sub(r'', text)
    text = _EMAIL_PATTERN.sub(r'.', text)
    text = _IP_PATTERN.sub(r'', text)
    text = text.replace('\n', '')
    return text.strip()

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value of a metric together with its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and weight total back to 0."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and refresh the running mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as "<minutes>m <seconds>s" (both truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Format the time elapsed since `since` and the projected time remaining.

    Args:
        since: Start timestamp on the `time.time()` clock.
        percent: Fraction of the work done so far; used as a divisor, so it
            must be non-zero.

    Returns:
        String of the form "<elapsed> (remain <remaining>)".
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))


def prepare_input(text, tokenizer):
    """Tokenize `text` into tensors of `CFG.max_length` entries each.

    When `CFG.tail` is 0 the tokenizer itself truncates and pads to
    `CFG.max_length`. Otherwise the text is encoded with `truncation=True`
    and no explicit `max_length`; any field longer than `CFG.max_length` is
    reduced to its first `CFG.head` and last `CFG.tail` entries, and shorter
    fields are right-padded (input ids with `tokenizer.pad_token_id`, every
    other field with 0).

    Args:
        text: String to encode.
        tokenizer: Object exposing `encode_plus` and `pad_token_id`.

    Returns:
        The mapping returned by `encode_plus`, with every value replaced by a
        `torch.long` tensor.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True
            )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True
        )
    for k, v in inputs.items():
        if len(v) > CFG.max_length:
            # Keep the beginning and the end of the sequence, drop the middle.
            v = np.hstack([v[:CFG.head], v[-CFG.tail:]])
        if k == 'input_ids':
            new_v = np.ones(CFG.max_length) * tokenizer.pad_token_id
        else:
            new_v = np.zeros(CFG.max_length)
        # Use the length measured after the head/tail cut, so the slice on the
        # left always has exactly as many slots as there are values.
        new_v[:len(v)] = v
        inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


# https://zenn.dev/hellorusk/articles/7fd588cae5b173
# Differences between the Hugging Face tokenizer's tokenize, encode, encode_plus, etc.
class TrainDataset(Dataset):
    """Dataset that tokenizes the "text" column on access.

    Each item is a dict with 'ids' and 'mask' tensors and, when `is_train`
    is true, a float 'label' taken from the `CFG.target_col` column.
    """

    def __init__(self, df, tokenizer, max_length, is_train=True):
        """Store the frame and tokenizer and cache the columns read per item.

        Args:
            df: DataFrame with a "text" column (plus `CFG.target_col` when
                `is_train` is true).
            tokenizer: Tokenizer forwarded to `prepare_input`.
            max_length: Stored on the instance; the length actually used for
                encoding comes from `CFG` inside `prepare_input`.
            is_train: Whether items should include a label.
        """
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length
        self.text = df["text"].values
        # Extract the target column once here rather than on every item access.
        self.labels = df[CFG.target_col].values if is_train else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # prepare_input already returns torch.long tensors, so they are used
        # as they are instead of being copy-constructed with torch.tensor.
        item = {
            'ids': inputs["input_ids"],
            'mask': inputs["attention_mask"],
        }
        if self.is_train:
            item['label'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ====================================================
# LOSS
# ====================================================

# Registry of custom loss classes, consulted when `torch.nn` has no attribute of that name.
__CRITERIONS__ = {}


def get_criterion():
    """Build the loss named by `CFG.loss_name`, passing `CFG.loss_params` as kwargs.

    `torch.nn` is searched first, then the `__CRITERIONS__` registry.

    Raises:
        NotImplementedError: If the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        return getattr(nn, loss_name)(**CFG.loss_params)
    custom_loss = __CRITERIONS__.get(loss_name)
    if custom_loss is None:
        raise NotImplementedError
    return custom_loss(**CFG.loss_params)

# ====================================================
# Train
# ====================================================

# Custom optimizer
# Registry of custom optimizer classes; an entry here wins over `torch.optim`.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Build the optimizer named by `CFG.optimizer_name` over `model.parameters()`.

    The `__OPTIMIZERS__` registry is checked first; otherwise the class is
    taken from `torch.optim`. `CFG.optimizer_params` is passed as kwargs.
    """
    optimizer_cls = __OPTIMIZERS__.get(CFG.optimizer_name)
    if optimizer_cls is None:
        optimizer_cls = getattr(optim, CFG.optimizer_name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Build the warmup learning-rate schedule selected by `cfg.scheduler`.

    Args:
        cfg: Object providing `scheduler` ('linear' or 'cosine'),
            `num_warmup_steps` and, for 'cosine', `num_cycles`.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps.

    Returns:
        The scheduler created by the matching transformers helper.

    Raises:
        NotImplementedError: If `cfg.scheduler` is neither 'linear' nor
            'cosine' (previously this fell through to an unbound local).
    """
    if cfg.scheduler=='linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler=='cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    raise NotImplementedError(f"Unknown scheduler: {cfg.scheduler!r}")

class Model(nn.Module):
    """Pretrained encoder followed by mean pooling and a linear output layer."""

    def __init__(self, modelname_or_path):
        """Load the encoder and create the `hidden_node -> num_classes` head."""
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        # Probability 0: the layer passes its input through unchanged.
        self.dropout = nn.Dropout(p=0.)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged along dimension 1.

        The average covers every index of that dimension; `attention_mask`
        is handed to the encoder but is not applied to the average itself.
        """
        encoder_outputs = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False
        )
        return encoder_outputs[0].mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Pool the encoder output and project it through the linear head."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))




def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler,device):
    """Run one training pass over `train_loader` and return the mean loss.

    Args:
        train_loader: Iterable of dict batches with 'ids', 'mask' and 'label'.
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        criterion: Loss applied to the model output and the labels.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only in the progress line.
        scheduler: Learning-rate scheduler stepped once per batch, or None.
        device: Device the batch tensors are moved to.

    Returns:
        Batch-size-weighted average of the per-batch loss values.
    """
    # NOTE(review): a new GradScaler is created on every call, so its
    # loss-scale state does not carry over from one call to the next.
    if CFG.use_amp:
        scaler = GradScaler()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, batch_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        label = batch_data['label'].to(device)
        batch_size = ids.size(0)
        # squeeze(-1) drops only a trailing size-1 dimension, so a batch
        # holding a single sample keeps its batch dimension.
        if CFG.use_amp:
            with autocast():
                outputs = model(input_ids=ids, attention_mask=mask)
                loss = criterion(outputs.squeeze(-1), label)
        else:
            outputs = model(input_ids=ids, attention_mask=mask)
            loss = criterion(outputs.squeeze(-1), label)
        # record loss
        losses.update(loss.item(), batch_size)
        if CFG.use_amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            # get_last_lr() is the supported way to read the rate set by the
            # latest scheduler step; without a scheduler, read the optimizer.
            if scheduler is not None:
                current_lr = scheduler.get_last_lr()[0]
            else:
                current_lr = optimizer.param_groups[0]["lr"]
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "LR: {lr: 8f}"
                .format(
                    epoch+1, step, len(train_loader), batch_time=batch_time,
                    data_time=data_time, loss=losses,
                    remain=timeSince(start, float(step+1)/len(train_loader)),
                    lr=current_lr
                    )
                )
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Run `model` over `valid_loader` without gradients and collect its outputs.

    Args:
        valid_loader: Iterable of dict batches with 'ids' and 'mask'.
        model: Module called as `model(input_ids=..., attention_mask=...)`.
        criterion: Accepted for signature symmetry; not used here.
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array of the per-batch outputs concatenated along the first axis.
    """
    step_timer = AverageMeter()
    load_timer = AverageMeter()
    # switch to evaluation mode
    model.eval()
    collected = []
    started = tick = time.time()
    for step, batch_data in enumerate(valid_loader):
        # time spent waiting for the batch
        load_timer.update(time.time() - tick)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask)
        collected.append(outputs.to('cpu').numpy())
        # time spent on the whole step
        step_timer.update(time.time() - tick)
        tick = time.time()
        is_last_step = step == (len(valid_loader)-1)
        if step % CFG.print_freq == 0 or is_last_step:
            print(
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                .format(
                    step, len(valid_loader), batch_time=step_timer,
                    data_time=load_timer,
                    remain=timeSince(started, float(step+1)/len(valid_loader)),
                    )
                )
    return np.concatenate(collected)


def train_loop(folds, validation):
    """Train the model on `folds` and score it on the validation pairs each epoch.

    After every epoch the model predicts each unique cleaned validation text,
    the predictions are merged back onto the pairs, and the pairwise score is
    logged. Whenever the score beats the best so far, the model weights are
    saved under OUTPUT_DIR and that epoch's predictions are kept.

    Args:
        folds: Training DataFrame with a "comment_text" column and the
            `CFG.target_col` column.
        validation: DataFrame with "less_toxic" and "more_toxic" text columns.
            It is not modified; cleaning happens on a copy.

    Returns:
        Tuple `(validation_last, validation_less, validation_more)`:
        `validation_last` is a copy of the input pairs carrying the
        predictions of the best-scoring epoch; `validation_less` and
        `validation_more` are the cleaned pairs merged with the final
        epoch's predictions for the less/more toxic text respectively.
    """
    validation_last = validation.copy()
    # Clean a private copy so the caller's frame keeps its original text.
    validation = validation.copy()
    # ====================================================
    # loader
    # ====================================================
    train_folds = folds.reset_index(drop=True)

    print("Text cleaning...")
    train_folds['text'] = train_folds['comment_text'].progress_apply(text_cleaning)
    validation['less_toxic'] = validation['less_toxic'].progress_apply(text_cleaning)
    validation['more_toxic'] = validation['more_toxic'].progress_apply(text_cleaning)

    print("Train Shape:", train_folds.shape)
    if CFG.under_sampling:
        print("Under Sampling")
        train_folds_0 = train_folds[train_folds["target"]==0]
        train_folds_0 = train_folds_0.sample(frac=CFG.under_sampling_ratio, random_state=CFG.seed)
        train_folds_1 = train_folds[train_folds["target"]>0]
        train_folds = pd.concat([train_folds_1, train_folds_0], axis=0).reset_index(drop=True)
        print("Train Shape (After under sampling):", train_folds.shape)

    # Each distinct cleaned text is predicted once and merged back per pair.
    validation_data = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    validation_data = pd.DataFrame({'text': validation_data}).reset_index()
    print("Valid Shape:", validation_data.shape)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

    train_dataset = TrainDataset(train_folds, tokenizer, CFG.max_length, is_train=True)
    valid_dataset = TrainDataset(validation_data, tokenizer, CFG.max_length, is_train=False)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    # initialize
    model = Model(CFG.base_model_name)
    model.to(device)
    criterion = get_criterion()

    optimizer = get_optimizer(model)
    # The scheduler is stepped once per batch and the loader drops the last
    # partial batch, so the schedule length is batches-per-epoch * epochs.
    num_train_steps = len(train_loader) * CFG.epochs
    CFG.num_warmup_steps=num_train_steps*CFG.num_warmup_steps_ratio
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    best_score = 0

    for epoch in range(CFG.start_epoch, CFG.start_epoch + CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
            )

        # eval
        preds = valid_fn(
            valid_loader,
            model,
            criterion,
            device
            )

        # scoring
        validation_data["pred"] = preds

        # Drop the previous epoch's prediction columns before merging new ones.
        if 'less_toxic_preds' in validation.columns:
            validation = validation.drop(columns='less_toxic_preds')
        if 'more_toxic_preds' in validation.columns:
            validation = validation.drop(columns='more_toxic_preds')
        less_preds = validation_data[["text", "pred"]].rename(
            columns={"text": 'less_toxic', 'pred': 'less_toxic_preds'}
            )
        more_preds = validation_data[["text", "pred"]].rename(
            columns={"text": 'more_toxic', 'pred': 'more_toxic_preds'}
            )
        validation_less = validation.merge(less_preds, on='less_toxic', how='left')
        validation_more = validation.merge(more_preds, on='more_toxic', how='left')
        # Both prediction columns together: reuse the "less" merge and add "more".
        validation = validation_less.merge(more_preds, on='more_toxic', how='left')

        # scoring
        score = get_result(validation)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s")
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save : {score:.4f} Model")
            torch.save({"model": model.state_dict()},
                        OUTPUT_DIR+'multilingual-toxic-xlm-roberta_best_score.pth')
            validation_last["more_toxic_preds"] = validation["more_toxic_preds"]
            validation_last["less_toxic_preds"] = validation["less_toxic_preds"]
    return validation_last, validation_less, validation_more


def main():
    """Seed the run, load the data, train, log the scores and write the result CSVs.

    Nothing beyond seeding and loading happens when `CFG.train` is false.
    With `CFG.debug` set, training uses one epoch on a 1000-row sample.
    """
    seed_torch(seed=CFG.seed)
    train, validation_data, test, sub = read_processed_data()
    if CFG.debug:
        CFG.epochs = 1
        train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
    oof_df = validation_data.copy()
    if not CFG.train:
        return

    # train
    n_pairs = len(validation_data)
    _oof_df, validation_less, validation_more = train_loop(train, validation_data)
    oof_more_toxic = np.zeros(n_pairs) + _oof_df["more_toxic_preds"].values
    oof_less_toxic = np.zeros(n_pairs) + _oof_df["less_toxic_preds"].values
    get_result(_oof_df)

    # CV result
    LOGGER.info(f"========== CV ==========")
    oof_df["more_toxic_preds"] = oof_more_toxic
    oof_df["less_toxic_preds"] = oof_less_toxic
    get_result(oof_df)

    # save result
    for frame, file_name in ((validation_less, "less_df.csv"), (validation_more, "more_df.csv")):
        frame.to_csv(OUTPUT_DIR+file_name, index=False)

In [10]:
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

                 id  ... pseudo_label
0  0000997932d777bf  ...    -0.493725
1  000103f0d9cfb60f  ...    -0.636400
2  000113f07ec002fd  ...    -0.496514
3  0001b41b1c6bb37e  ...    -0.254959
4  0001d958c54c6e35  ...    -0.461024

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Valid Shape: (14237, 2)


Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch: [1][0/3373] Data 0.614 (0.614) Elapsed 0m 1s (remain 79m 1s) Loss: 0.0638(0.0638) LR:  0.000000
Epoch: [1][100/3373] Data 0.000 (0.006) Elapsed 1m 19s (remain 42m 41s) Loss: 0.0307(0.0462) LR:  0.000001
Epoch: [1][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 14s) Loss: 0.0208(0.0363) LR:  0.000001
Epoch: [1][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 52s) Loss: 0.0083(0.0291) LR:  0.000002
Epoch: [1][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 33s) Loss: 0.0142(0.0239) LR:  0.000002
Epoch: [1][500/3373] Data 0.000 (0.001) Elapsed 6m 29s (remain 37m 15s) Loss: 0.0111(0.0203) LR:  0.000003
Epoch: [1][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 56s) Loss: 0.0045(0.0178) LR:  0.000004
Epoch: [1][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 38s) Loss: 0.0035(0.0158) LR:  0.000004
Epoch: [1][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 20s) Loss: 0.0019(0.0143) LR:  0.000005
Epoch: [1][900/3373] Data 0.000 (0.001) E

Score: 0.7189
Score: 0.7189
Score: 0.7189
Epoch 1 - avg_train_loss: 0.0053  time: 2674s
Epoch 1 - avg_train_loss: 0.0053  time: 2674s
Epoch 1 - avg_train_loss: 0.0053  time: 2674s
Epoch 1 - Score: 0.7189
Epoch 1 - Score: 0.7189
Epoch 1 - Score: 0.7189
Epoch 1 - Save : 0.7189 Model
Epoch 1 - Save : 0.7189 Model
Epoch 1 - Save : 0.7189 Model


Epoch: [2][0/3373] Data 0.583 (0.583) Elapsed 0m 1s (remain 77m 26s) Loss: 0.0014(0.0014) LR:  0.000010
Epoch: [2][100/3373] Data 0.000 (0.006) Elapsed 1m 19s (remain 42m 42s) Loss: 0.0032(0.0021) LR:  0.000010
Epoch: [2][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 14s) Loss: 0.0017(0.0022) LR:  0.000010
Epoch: [2][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 53s) Loss: 0.0034(0.0021) LR:  0.000010
Epoch: [2][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 34s) Loss: 0.0016(0.0021) LR:  0.000010
Epoch: [2][500/3373] Data 0.000 (0.001) Elapsed 6m 29s (remain 37m 15s) Loss: 0.0017(0.0021) LR:  0.000009
Epoch: [2][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 57s) Loss: 0.0047(0.0021) LR:  0.000009
Epoch: [2][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 39s) Loss: 0.0026(0.0021) LR:  0.000009
Epoch: [2][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 20s) Loss: 0.0016(0.0021) LR:  0.000009
Epoch: [2][900/3373] Data 0.000 (0.001) 

Score: 0.7201
Score: 0.7201
Score: 0.7201
Epoch 2 - avg_train_loss: 0.0020  time: 2675s
Epoch 2 - avg_train_loss: 0.0020  time: 2675s
Epoch 2 - avg_train_loss: 0.0020  time: 2675s
Epoch 2 - Score: 0.7201
Epoch 2 - Score: 0.7201
Epoch 2 - Score: 0.7201
Epoch 2 - Save : 0.7201 Model
Epoch 2 - Save : 0.7201 Model
Epoch 2 - Save : 0.7201 Model


Epoch: [3][0/3373] Data 0.529 (0.529) Elapsed 0m 1s (remain 73m 57s) Loss: 0.0029(0.0029) LR:  0.000008
Epoch: [3][100/3373] Data 0.000 (0.005) Elapsed 1m 19s (remain 42m 40s) Loss: 0.0013(0.0016) LR:  0.000007
Epoch: [3][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 14s) Loss: 0.0013(0.0017) LR:  0.000007
Epoch: [3][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 53s) Loss: 0.0026(0.0016) LR:  0.000007
Epoch: [3][400/3373] Data 0.000 (0.001) Elapsed 5m 12s (remain 38m 33s) Loss: 0.0010(0.0017) LR:  0.000007
Epoch: [3][500/3373] Data 0.000 (0.001) Elapsed 6m 29s (remain 37m 15s) Loss: 0.0010(0.0017) LR:  0.000007
Epoch: [3][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 56s) Loss: 0.0006(0.0017) LR:  0.000007
Epoch: [3][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 38s) Loss: 0.0018(0.0017) LR:  0.000007
Epoch: [3][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 20s) Loss: 0.0012(0.0017) LR:  0.000007
Epoch: [3][900/3373] Data 0.000 (0.001) 

Score: 0.7216
Score: 0.7216
Score: 0.7216
Epoch 3 - avg_train_loss: 0.0016  time: 2674s
Epoch 3 - avg_train_loss: 0.0016  time: 2674s
Epoch 3 - avg_train_loss: 0.0016  time: 2674s
Epoch 3 - Score: 0.7216
Epoch 3 - Score: 0.7216
Epoch 3 - Score: 0.7216
Epoch 3 - Save : 0.7216 Model
Epoch 3 - Save : 0.7216 Model
Epoch 3 - Save : 0.7216 Model


Epoch: [4][0/3373] Data 0.631 (0.631) Elapsed 0m 1s (remain 79m 41s) Loss: 0.0011(0.0011) LR:  0.000004
Epoch: [4][100/3373] Data 0.000 (0.006) Elapsed 1m 19s (remain 42m 44s) Loss: 0.0011(0.0014) LR:  0.000004
Epoch: [4][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 15s) Loss: 0.0006(0.0015) LR:  0.000004
Epoch: [4][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 54s) Loss: 0.0010(0.0015) LR:  0.000004
Epoch: [4][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 34s) Loss: 0.0024(0.0015) LR:  0.000004
Epoch: [4][500/3373] Data 0.000 (0.001) Elapsed 6m 30s (remain 37m 16s) Loss: 0.0012(0.0014) LR:  0.000004
Epoch: [4][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 57s) Loss: 0.0013(0.0015) LR:  0.000004
Epoch: [4][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 39s) Loss: 0.0017(0.0014) LR:  0.000003
Epoch: [4][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 21s) Loss: 0.0011(0.0014) LR:  0.000003
Epoch: [4][900/3373] Data 0.000 (0.001) 

Score: 0.7217
Score: 0.7217
Score: 0.7217
Epoch 4 - avg_train_loss: 0.0014  time: 2675s
Epoch 4 - avg_train_loss: 0.0014  time: 2675s
Epoch 4 - avg_train_loss: 0.0014  time: 2675s
Epoch 4 - Score: 0.7217
Epoch 4 - Score: 0.7217
Epoch 4 - Score: 0.7217
Epoch 4 - Save : 0.7217 Model
Epoch 4 - Save : 0.7217 Model
Epoch 4 - Save : 0.7217 Model


Epoch: [5][0/3373] Data 0.661 (0.661) Elapsed 0m 1s (remain 80m 55s) Loss: 0.0021(0.0021) LR:  0.000001
Epoch: [5][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 44s) Loss: 0.0011(0.0013) LR:  0.000001
Epoch: [5][200/3373] Data 0.000 (0.003) Elapsed 2m 36s (remain 41m 15s) Loss: 0.0012(0.0014) LR:  0.000001
Epoch: [5][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 54s) Loss: 0.0008(0.0013) LR:  0.000001
Epoch: [5][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 35s) Loss: 0.0013(0.0013) LR:  0.000001
Epoch: [5][500/3373] Data 0.000 (0.001) Elapsed 6m 30s (remain 37m 16s) Loss: 0.0009(0.0013) LR:  0.000001
Epoch: [5][600/3373] Data 0.000 (0.001) Elapsed 7m 47s (remain 35m 57s) Loss: 0.0023(0.0013) LR:  0.000001
Epoch: [5][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 39s) Loss: 0.0006(0.0013) LR:  0.000001
Epoch: [5][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 21s) Loss: 0.0010(0.0013) LR:  0.000001
Epoch: [5][900/3373] Data 0.000 (0.001) 

Score: 0.7226
Score: 0.7226
Score: 0.7226
Epoch 5 - avg_train_loss: 0.0013  time: 2674s
Epoch 5 - avg_train_loss: 0.0013  time: 2674s
Epoch 5 - avg_train_loss: 0.0013  time: 2674s
Epoch 5 - Score: 0.7226
Epoch 5 - Score: 0.7226
Epoch 5 - Score: 0.7226
Epoch 5 - Save : 0.7226 Model
Epoch 5 - Save : 0.7226 Model
Epoch 5 - Save : 0.7226 Model
Score: 0.7226
Score: 0.7226
Score: 0.7226
Score: 0.7226
Score: 0.7226
Score: 0.7226


In [2]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
tqdm.pandas()

# Device name used for `.to(...)` calls: "cuda" when PyTorch reports CUDA as available, else "cpu".
device = ("cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration, read as class attributes throughout the script."""

    # --- Globals ---
    debug = False
    exp_name = "exp4001_seed2"
    fold_seed = 0
    seed = 2
    start_epoch = 0
    epochs = 5
    train = True
    folds = [0, 1, 2, 3, 4]
    n_fold = 5
    print_freq = 100  # print a progress line every this many steps
    use_amp = True
    target_col = "pseudo_label"

    # --- Dataset ---
    head = 64  # entries kept from the start of an over-long encoding
    tail = 64  # entries kept from its end
    max_length = head + tail
    under_sampling = False
    under_sampling_ratio = 0.1

    # --- Augmentation --- (none configured)

    # --- Loaders ---
    batch_size = 64
    num_workers = 8

    # --- Model ---
    # https://huggingface.co/deepset/xlm-roberta-base-squad2
    base_model_name = "unitary/multilingual-toxic-xlm-roberta"
    pretrained = True
    num_classes = 1  # Binary
    hidden_node = 768  # large: 1024, base: 768

    # --- Criterion ---
    loss_name = "MSELoss"
    loss_params: dict = {}

    # --- Optimizer ---
    optimizer_name = "AdamW"
    optimizer_params = {"lr": 1e-5}

    # --- Scheduler ---
    scheduler = "cosine"
    num_cycles = 0.5
    num_warmup_steps_ratio = 0.1
    

# ====================================================
# Directory settings
# ====================================================
# Folder the input CSV files are read from.
INPUT_PATH = "../input/"
# Per-experiment folder that receives the log file, checkpoint and CSV outputs.
OUTPUT_DIR = f'../output/{CFG.exp_name}/'
# exist_ok=True creates the folder only when it is missing, without a separate
# exists() check that another process could invalidate in between.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Value applied to `random`, NumPy and PyTorch (CPU and CUDA) and
            exported as the PYTHONHASHSEED environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # Disable cuDNN autotuning and request deterministic kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Return this module's logger, writing bare messages to the console and a file.

    Handlers left on the logger by an earlier call are removed first.
    `getLogger(__name__)` hands back the same logger object on every call, so
    calling this again (for example by re-running a notebook cell) would
    otherwise stack another pair of handlers and emit each message repeatedly.

    Args:
        log_file: Path of the log file; defaults to "train.log" in OUTPUT_DIR.

    Returns:
        The configured logger, set to INFO level.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Iterate over a copy because removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


# Shared logger used by the scoring and training code.
LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the fraction of pairs ranked in the expected order.

    A pair counts as correct when the prediction for the "more toxic" text is
    strictly greater than the prediction for the "less toxic" text.
    """
    correctly_ranked = more_toxic_preds > less_toxic_preds
    return np.mean(correctly_ranked)



def get_result(df):
    """Score a frame of pairwise predictions, log the score and return it.

    Args:
        df: DataFrame with "more_toxic_preds" and "less_toxic_preds" columns.

    Returns:
        The value `get_score` computes from the two columns.
    """
    score = get_score(df["more_toxic_preds"].values, df["less_toxic_preds"].values)
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def read_processed_data():
    """Load the CSV inputs and drop training rows whose text appears in validation.

    Returns:
        Tuple `(train, validation_data, test, sub)`. `train` is the
        pseudo-label dataset without the rows whose `comment_text` equals a
        `less_toxic` or `more_toxic` text of the validation pairs.
    """
    validation_data, test, sub = (
        pd.read_csv(INPUT_PATH + file_name)
        for file_name in ("validation_data.csv", "comments_to_score.csv", "sample_submission.csv")
    )
    pseudo_labelled = pd.read_csv("../input/jigsaw_1st/PseudoLabelDataset.csv")
    # Exact string match on the raw text of both sides of every validation pair.
    validation_texts = pd.concat([validation_data['less_toxic'], validation_data['more_toxic']]).unique()
    seen_in_validation = pseudo_labelled['comment_text'].isin(validation_texts)
    train2017 = pseudo_labelled[~seen_in_validation]
    print(train2017.head())
    return train2017, validation_data, test, sub


# Patterns are compiled once instead of on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
_IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')


def text_cleaning(text):
    """Strip links, e-mail addresses, IPv4-looking numbers and newlines from `text`.

    Steps, in order: website links are removed, each e-mail address is
    replaced by a single ".", dotted four-number groups are removed, newline
    characters are deleted (no space is inserted in their place), and
    leading/trailing whitespace is stripped.

    Args:
        text: String to clean.

    Returns:
        The cleaned string.
    """
    text = _URL_PATTERN.sub(r'', text)
    text = _EMAIL_PATTERN.sub(r'.', text)
    text = _IP_PATTERN.sub(r'', text)
    text = text.replace('\n', '')
    return text.strip()

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value of a metric together with its weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the latest value, mean, weighted sum and weight total back to 0."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` with weight `n` and refresh the running mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as "<minutes>m <seconds>s" (both truncated to integers)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Format the time elapsed since `since` and the projected time remaining.

    Args:
        since: Start timestamp on the `time.time()` clock.
        percent: Fraction of the work done so far; used as a divisor, so it
            must be non-zero.

    Returns:
        String of the form "<elapsed> (remain <remaining>)".
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))


def prepare_input(text, tokenizer):
    """Tokenize `text` into tensors of `CFG.max_length` entries each.

    When `CFG.tail` is 0 the tokenizer itself truncates and pads to
    `CFG.max_length`. Otherwise the text is encoded with `truncation=True`
    and no explicit `max_length`; any field longer than `CFG.max_length` is
    reduced to its first `CFG.head` and last `CFG.tail` entries, and shorter
    fields are right-padded (input ids with `tokenizer.pad_token_id`, every
    other field with 0).

    Args:
        text: String to encode.
        tokenizer: Object exposing `encode_plus` and `pad_token_id`.

    Returns:
        The mapping returned by `encode_plus`, with every value replaced by a
        `torch.long` tensor.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # `padding="max_length"` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding="max_length",
            truncation=True
            )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True
        )
    for k, v in inputs.items():
        if len(v) > CFG.max_length:
            # Keep the beginning and the end of the sequence, drop the middle.
            v = np.hstack([v[:CFG.head], v[-CFG.tail:]])
        if k == 'input_ids':
            new_v = np.ones(CFG.max_length) * tokenizer.pad_token_id
        else:
            new_v = np.zeros(CFG.max_length)
        # Use the length measured after the head/tail cut, so the slice on the
        # left always has exactly as many slots as there are values.
        new_v[:len(v)] = v
        inputs[k] = torch.tensor(new_v, dtype=torch.long)
    return inputs


# https://zenn.dev/hellorusk/articles/7fd588cae5b173
# Differences between the Hugging Face tokenizer's tokenize, encode, encode_plus, etc.
class TrainDataset(Dataset):
    """Dataset that tokenizes the ``text`` column of a DataFrame on access.

    Args:
        df: DataFrame with a ``text`` column and, when ``is_train`` is true,
            a ``CFG.target_col`` column holding the labels.
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        max_length: Stored on the instance; sequence length is actually
            governed by ``CFG`` inside ``prepare_input``.
        is_train: When true, each item also carries a ``label`` tensor.
    """

    def __init__(self, df, tokenizer, max_length, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length
        self.text = df["text"].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{'ids', 'mask'[, 'label']}`` tensors for row ``idx``."""
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # prepare_input already yields long tensors; as_tensor reuses them
        # instead of copy-constructing a tensor from a tensor (which makes a
        # redundant copy and emits a UserWarning).
        item = {
            'ids': torch.as_tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.as_tensor(inputs["attention_mask"], dtype=torch.long),
        }
        if self.is_train:
            label = self.df[CFG.target_col].values[idx]
            item['label'] = torch.tensor(label, dtype=torch.float)
        return item

# ====================================================
# LOSS
# ====================================================

# Registry for custom loss classes, keyed by name.
__CRITERIONS__ = {}


def get_criterion():
    """Instantiate the loss named by ``CFG.loss_name`` with ``CFG.loss_params``.

    ``torch.nn`` is searched first, then the ``__CRITERIONS__`` registry.

    Raises:
        NotImplementedError: if the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        loss_cls = getattr(nn, loss_name)
    else:
        loss_cls = __CRITERIONS__.get(loss_name)
        if loss_cls is None:
            raise NotImplementedError
    return loss_cls(**CFG.loss_params)

# ====================================================
# Train
# ====================================================

# Custom optimizer
# Registry for custom optimizer classes, keyed by name.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Build the optimizer named by ``CFG.optimizer_name`` over ``model.parameters()``.

    The ``__OPTIMIZERS__`` registry takes precedence; otherwise the name is
    looked up on ``torch.optim``. ``CFG.optimizer_params`` is passed through
    as keyword arguments.
    """
    name = CFG.optimizer_name
    registered_cls = __OPTIMIZERS__.get(name)
    optimizer_cls = registered_cls if registered_cls is not None else getattr(optim, name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: Config object providing ``scheduler`` (``'linear'`` or
            ``'cosine'``), ``num_warmup_steps`` and, for the cosine schedule,
            ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of training steps of the schedule.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: if ``cfg.scheduler`` is not a supported name (previously
            this surfaced as an UnboundLocalError at the ``return``).
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
            num_cycles=cfg.num_cycles,
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )

class Model(nn.Module):
    """Pretrained encoder with a single linear head on mean-pooled states.

    Args:
        modelname_or_path: identifier or path handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, modelname_or_path):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        # Dropout probability is fixed at 0 here.
        self.dropout = nn.Dropout(p=0.)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged along dimension 1."""
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        token_states = encoder_out[0]
        # NOTE(review): plain mean over dim 1 -- attention_mask is not applied
        # to the average, so padded positions (if any) contribute too.
        return token_states.mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Map token ids to the linear head's output."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))




def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Run one training pass over ``train_loader`` and return the mean loss.

    Args:
        train_loader: Iterable of batches with ``'ids'``, ``'mask'`` and
            ``'label'`` tensors.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss callable taking ``(predictions, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only for logging.
        scheduler: LR scheduler stepped once per batch, or ``None``.
        device: Device the batch tensors are moved to.

    Returns:
        Batch-size-weighted average of the per-batch loss values.
    """
    # With enabled=False both the scaler and autocast are pass-throughs, so a
    # single code path covers the AMP and non-AMP configurations.
    # The scaler is created per call, i.e. its scale state restarts each epoch.
    scaler = GradScaler(enabled=CFG.use_amp)
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    num_steps = len(train_loader)
    start = end = time.time()
    for step, batch_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        label = batch_data['label'].to(device)
        batch_size = ids.size(0)
        with autocast(enabled=CFG.use_amp):
            outputs = model(input_ids=ids, attention_mask=mask)
            # Drop only a trailing singleton dim; a bare squeeze() would also
            # remove the batch dimension when the batch holds one sample.
            loss = criterion(outputs.squeeze(-1), label)
        # record loss
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (num_steps - 1):
            # Stepping tolerates scheduler=None, so logging must as well.
            if scheduler is not None:
                current_lr = scheduler.get_last_lr()[0]
            else:
                current_lr = optimizer.param_groups[0]['lr']
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "LR: {lr: 8f}"
                .format(
                    epoch+1, step, num_steps, batch_time=batch_time,
                    data_time=data_time, loss=losses,
                    remain=timeSince(start, float(step+1)/num_steps),
                    lr=current_lr
                    )
                )
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Run inference over ``valid_loader`` and return the stacked outputs.

    Args:
        valid_loader: Iterable of batches with ``'ids'`` and ``'mask'`` tensors.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Accepted for signature symmetry; not used here.
        device: Device the batch tensors are moved to.

    Returns:
        ``np.concatenate`` of the per-batch model outputs (moved to CPU).
    """
    batch_time, data_time = AverageMeter(), AverageMeter()
    # switch to evaluation mode
    model.eval()
    collected = []
    start = end = time.time()
    for step, batch_data in enumerate(valid_loader):
        # time spent waiting for the loader
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        # forward pass without building the autograd graph
        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask)
        collected.append(outputs.cpu().numpy())
        # time spent on the whole step
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            progress = float(step+1)/len(valid_loader)
            template = (
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
            )
            print(template.format(
                step, len(valid_loader), batch_time=batch_time,
                data_time=data_time,
                remain=timeSince(start, progress),
            ))
    return np.concatenate(collected)


def train_loop(folds, validation):
    """Fine-tune the model on ``folds`` and score it on ``validation`` each epoch.

    Args:
        folds: Training DataFrame with a ``comment_text`` column and the
            ``CFG.target_col`` label column (plus ``target`` when
            ``CFG.under_sampling`` is enabled).
        validation: DataFrame with ``less_toxic`` and ``more_toxic`` text
            columns. NOTE: both columns are overwritten in place with their
            cleaned text on the object passed in.

    Returns:
        ``(validation_last, validation_less, validation_more)`` where
        ``validation_last`` is a copy of ``validation`` taken before cleaning,
        extended with ``more_toxic_preds`` / ``less_toxic_preds`` from the
        best-scoring epoch; ``validation_less`` is that frame without
        ``more_toxic_preds`` and ``validation_more`` is it without
        ``less_toxic_preds``.

    Raises:
        RuntimeError: if no epoch produced a recordable score (for example
            zero epochs, or a NaN score), so there are no predictions to
            return.

    Side effects: sets ``CFG.num_warmup_steps`` and writes the best model's
    state dict under ``OUTPUT_DIR``.
    """
    # Snapshot taken before cleaning so the returned frames keep the raw text.
    validation_last = validation.copy()
    # ====================================================
    # loader
    # ====================================================
    train_folds = folds.reset_index(drop=True)

    print("Text cleaning...")
    train_folds['text'] = train_folds['comment_text'].progress_apply(text_cleaning)
    validation['less_toxic'] = validation['less_toxic'].progress_apply(text_cleaning)
    validation['more_toxic'] = validation['more_toxic'].progress_apply(text_cleaning)

    print("Train Shape:", train_folds.shape)
    if CFG.under_sampling:
        # Keep every row with target > 0 and a sampled fraction of target == 0.
        print("Under Sampling")
        train_folds_0 = train_folds[train_folds["target"]==0]
        train_folds_0 = train_folds_0.sample(frac=CFG.under_sampling_ratio, random_state=CFG.seed)
        train_folds_1 = train_folds[train_folds["target"]>0]
        train_folds = pd.concat([train_folds_1, train_folds_0], axis=0).reset_index(drop=True)
        print("Train Shape (After under sampling):", train_folds.shape)

    # Each distinct cleaned comment is scored once, then joined back per pair.
    validation_data = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    validation_data = pd.DataFrame({'text': validation_data}).reset_index()
    print("Valid Shape:", validation_data.shape)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

    train_dataset = TrainDataset(train_folds, tokenizer, CFG.max_length, is_train=True)
    valid_dataset = TrainDataset(validation_data, tokenizer, CFG.max_length, is_train=False)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    # initialize
    model = Model(CFG.base_model_name)
    model.to(device)
    criterion = get_criterion()

    optimizer = get_optimizer(model)
    # NOTE(review): computed from the row count, while the loader uses
    # drop_last=True, so this can differ slightly from the number of
    # optimizer steps actually taken.
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    CFG.num_warmup_steps=num_train_steps*CFG.num_warmup_steps_ratio
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    # Start below any real score so the first valid epoch is always recorded;
    # the result frames stay None until that happens.
    best_score = -np.inf
    validation_less = None
    validation_more = None

    for epoch in range(CFG.start_epoch, CFG.start_epoch + CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
            )

        # eval
        preds = valid_fn(
            valid_loader,
            model,
            criterion,
            device
            )

        # attach predictions to the unique texts
        validation_data["pred"] = preds

        # Drop the previous epoch's prediction columns before re-merging.
        if 'less_toxic_preds' in validation.columns:
            validation = validation.drop(columns='less_toxic_preds')
        if 'more_toxic_preds' in validation.columns:
            validation = validation.drop(columns='more_toxic_preds')

        rename_cols = {"text": 'less_toxic', 'pred': 'less_toxic_preds'}
        validation = validation.merge(
            validation_data[["text", 'pred']].rename(columns=rename_cols),
            on='less_toxic',
            how='left'
            )
        rename_cols = {"text": 'more_toxic', 'pred': 'more_toxic_preds'}
        validation = validation.merge(
            validation_data[["text", 'pred']].rename(columns=rename_cols),
            on='more_toxic',
            how='left'
            )

        # scoring
        score = get_result(validation)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s")
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save : {score:.4f} Model")
            torch.save({"model": model.state_dict()},
                        OUTPUT_DIR+'multilingual-toxic-xlm-roberta_best_score.pth')
            # The merged frame has a fresh index while validation_last keeps
            # the caller's; rows are in the same order, so copy by position
            # rather than by index label.
            validation_last["more_toxic_preds"] = validation["more_toxic_preds"].values
            validation_last["less_toxic_preds"] = validation["less_toxic_preds"].values
            validation_less = validation_last.drop("more_toxic_preds", axis=1)
            validation_more = validation_last.drop("less_toxic_preds", axis=1)

    if validation_less is None or validation_more is None:
        raise RuntimeError(
            "train_loop finished without recording any epoch's predictions "
            "(no epochs were run or every score was NaN)"
        )
    return validation_last, validation_less, validation_more


def main():
    """Seed, load the data, train once, then log and save validation predictions.

    When ``CFG.debug`` is set, training is cut to one epoch on a 1000-row
    sample. When ``CFG.train`` is false nothing beyond loading happens.
    The per-pair prediction frames are written to ``less_df.csv`` and
    ``more_df.csv`` under ``OUTPUT_DIR``.
    """
    seed_torch(seed=CFG.seed)
    train, validation_data, test, sub = read_processed_data()
    if CFG.debug:
        CFG.epochs = 1
        train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
    # Copied before training, which rewrites validation_data's text columns.
    oof_df = validation_data.copy()
    if not CFG.train:
        return

    # train
    pred_columns = ("more_toxic_preds", "less_toxic_preds")
    accumulated = {column: np.zeros(len(validation_data)) for column in pred_columns}
    _oof_df, validation_less, validation_more = train_loop(train, validation_data)
    for column in pred_columns:
        accumulated[column] += _oof_df[column].values
    get_result(_oof_df)

    # CV result
    LOGGER.info("========== CV ==========")
    for column in pred_columns:
        oof_df[column] = accumulated[column]
    get_result(oof_df)

    # save result
    validation_less.to_csv(OUTPUT_DIR+"less_df.csv", index=False)
    validation_more.to_csv(OUTPUT_DIR+"more_df.csv", index=False)

In [3]:
# Entry point: run the full pipeline only when this module is executed
# directly (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

                 id  ... pseudo_label
0  0000997932d777bf  ...    -0.493725
1  000103f0d9cfb60f  ...    -0.636400
2  000113f07ec002fd  ...    -0.496514
3  0001b41b1c6bb37e  ...    -0.254959
4  0001d958c54c6e35  ...    -0.461024

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Valid Shape: (14237, 2)


Some weights of the model checkpoint at unitary/multilingual-toxic-xlm-roberta were not used when initializing XLMRobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at unitary/multilingual-toxic-xlm-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

Epoch: [1][0/3373] Data 0.701 (0.701) Elapsed 0m 1s (remain 81m 57s) Loss: 0.1445(0.1445) LR:  0.000000
Epoch: [1][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 44s) Loss: 0.1054(0.1248) LR:  0.000001
Epoch: [1][200/3373] Data 0.000 (0.004) Elapsed 2m 36s (remain 41m 16s) Loss: 0.0421(0.0946) LR:  0.000001
Epoch: [1][300/3373] Data 0.000 (0.003) Elapsed 3m 54s (remain 39m 55s) Loss: 0.0259(0.0726) LR:  0.000002
Epoch: [1][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 36s) Loss: 0.0082(0.0582) LR:  0.000002
Epoch: [1][500/3373] Data 0.000 (0.002) Elapsed 6m 30s (remain 37m 17s) Loss: 0.0069(0.0485) LR:  0.000003
Epoch: [1][600/3373] Data 0.000 (0.001) Elapsed 7m 48s (remain 35m 59s) Loss: 0.0125(0.0416) LR:  0.000004
Epoch: [1][700/3373] Data 0.000 (0.001) Elapsed 9m 5s (remain 34m 40s) Loss: 0.0044(0.0364) LR:  0.000004
Epoch: [1][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 22s) Loss: 0.0033(0.0325) LR:  0.000005
Epoch: [1][900/3373] Data 0.000 (0.001) 

Score: 0.7180
Epoch 1 - avg_train_loss: 0.0098  time: 2677s
Epoch 1 - Score: 0.7180
Epoch 1 - Save : 0.7180 Model


Epoch: [2][0/3373] Data 0.602 (0.602) Elapsed 0m 1s (remain 78m 46s) Loss: 0.0014(0.0014) LR:  0.000010
Epoch: [2][100/3373] Data 0.000 (0.006) Elapsed 1m 19s (remain 42m 46s) Loss: 0.0013(0.0024) LR:  0.000010
Epoch: [2][200/3373] Data 0.000 (0.003) Elapsed 2m 37s (remain 41m 17s) Loss: 0.0014(0.0022) LR:  0.000010
Epoch: [2][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 56s) Loss: 0.0027(0.0022) LR:  0.000010
Epoch: [2][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 37s) Loss: 0.0021(0.0022) LR:  0.000010
Epoch: [2][500/3373] Data 0.000 (0.001) Elapsed 6m 30s (remain 37m 18s) Loss: 0.0009(0.0022) LR:  0.000009
Epoch: [2][600/3373] Data 0.000 (0.001) Elapsed 7m 48s (remain 35m 59s) Loss: 0.0018(0.0022) LR:  0.000009
Epoch: [2][700/3373] Data 0.000 (0.001) Elapsed 9m 6s (remain 34m 41s) Loss: 0.0030(0.0022) LR:  0.000009
Epoch: [2][800/3373] Data 0.000 (0.001) Elapsed 10m 23s (remain 33m 23s) Loss: 0.0012(0.0022) LR:  0.000009
Epoch: [2][900/3373] Data 0.000 (0.001) 

Score: 0.7207
Epoch 2 - avg_train_loss: 0.0021  time: 2679s
Epoch 2 - Score: 0.7207
Epoch 2 - Save : 0.7207 Model


Epoch: [3][0/3373] Data 0.649 (0.649) Elapsed 0m 1s (remain 80m 32s) Loss: 0.0011(0.0011) LR:  0.000008
Epoch: [3][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 48s) Loss: 0.0017(0.0017) LR:  0.000007
Epoch: [3][200/3373] Data 0.000 (0.003) Elapsed 2m 37s (remain 41m 19s) Loss: 0.0013(0.0017) LR:  0.000007
Epoch: [3][300/3373] Data 0.000 (0.002) Elapsed 3m 54s (remain 39m 58s) Loss: 0.0015(0.0017) LR:  0.000007
Epoch: [3][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 38s) Loss: 0.0022(0.0017) LR:  0.000007
Epoch: [3][500/3373] Data 0.000 (0.001) Elapsed 6m 30s (remain 37m 19s) Loss: 0.0010(0.0017) LR:  0.000007
Epoch: [3][600/3373] Data 0.000 (0.001) Elapsed 7m 48s (remain 36m 0s) Loss: 0.0017(0.0017) LR:  0.000007
Epoch: [3][700/3373] Data 0.000 (0.001) Elapsed 9m 6s (remain 34m 42s) Loss: 0.0016(0.0018) LR:  0.000007
Epoch: [3][800/3373] Data 0.000 (0.001) Elapsed 10m 24s (remain 33m 24s) Loss: 0.0013(0.0018) LR:  0.000007
Epoch: [3][900/3373] Data 0.000 (0.001) E

Score: 0.7223
Epoch 3 - avg_train_loss: 0.0017  time: 2679s
Epoch 3 - Score: 0.7223
Epoch 3 - Save : 0.7223 Model


Epoch: [4][0/3373] Data 0.682 (0.682) Elapsed 0m 1s (remain 83m 3s) Loss: 0.0012(0.0012) LR:  0.000004
Epoch: [4][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 50s) Loss: 0.0009(0.0015) LR:  0.000004
Epoch: [4][200/3373] Data 0.000 (0.004) Elapsed 2m 37s (remain 41m 20s) Loss: 0.0011(0.0016) LR:  0.000004
Epoch: [4][300/3373] Data 0.000 (0.002) Elapsed 3m 55s (remain 39m 58s) Loss: 0.0008(0.0016) LR:  0.000004
Epoch: [4][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 39s) Loss: 0.0012(0.0016) LR:  0.000004
Epoch: [4][500/3373] Data 0.000 (0.002) Elapsed 6m 30s (remain 37m 19s) Loss: 0.0007(0.0015) LR:  0.000004
Epoch: [4][600/3373] Data 0.000 (0.001) Elapsed 7m 48s (remain 36m 1s) Loss: 0.0010(0.0015) LR:  0.000004
Epoch: [4][700/3373] Data 0.000 (0.001) Elapsed 9m 6s (remain 34m 42s) Loss: 0.0009(0.0015) LR:  0.000003
Epoch: [4][800/3373] Data 0.000 (0.001) Elapsed 10m 24s (remain 33m 24s) Loss: 0.0041(0.0015) LR:  0.000003
Epoch: [4][900/3373] Data 0.000 (0.001) El

Score: 0.7233
Epoch 4 - avg_train_loss: 0.0015  time: 2679s
Epoch 4 - Score: 0.7233
Epoch 4 - Save : 0.7233 Model


Epoch: [5][0/3373] Data 0.692 (0.692) Elapsed 0m 1s (remain 83m 22s) Loss: 0.0007(0.0007) LR:  0.000001
Epoch: [5][100/3373] Data 0.000 (0.007) Elapsed 1m 19s (remain 42m 50s) Loss: 0.0020(0.0013) LR:  0.000001
Epoch: [5][200/3373] Data 0.000 (0.004) Elapsed 2m 37s (remain 41m 20s) Loss: 0.0008(0.0013) LR:  0.000001
Epoch: [5][300/3373] Data 0.000 (0.003) Elapsed 3m 55s (remain 39m 58s) Loss: 0.0016(0.0013) LR:  0.000001
Epoch: [5][400/3373] Data 0.000 (0.002) Elapsed 5m 12s (remain 38m 38s) Loss: 0.0007(0.0013) LR:  0.000001
Epoch: [5][500/3373] Data 0.000 (0.002) Elapsed 6m 30s (remain 37m 19s) Loss: 0.0014(0.0013) LR:  0.000001
Epoch: [5][600/3373] Data 0.000 (0.001) Elapsed 7m 48s (remain 36m 0s) Loss: 0.0025(0.0014) LR:  0.000001
Epoch: [5][700/3373] Data 0.000 (0.001) Elapsed 9m 6s (remain 34m 42s) Loss: 0.0006(0.0013) LR:  0.000001
Epoch: [5][800/3373] Data 0.000 (0.001) Elapsed 10m 24s (remain 33m 24s) Loss: 0.0010(0.0014) LR:  0.000001
Epoch: [5][900/3373] Data 0.000 (0.001) E

Score: 0.7231
Epoch 5 - avg_train_loss: 0.0014  time: 2678s
Epoch 5 - Score: 0.7231
Score: 0.7233
Score: 0.7233
