In [None]:
!pip -q install transformers==4.5.1

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
tqdm.pandas()

device = ("cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration; settings are class attributes read as ``CFG.<name>``."""
    ######################
    # Globals #
    ######################
    debug = False  # when True, main() trains 1 epoch on a 1000-row sample
    exp_name = "exp1004"  # becomes the output sub-directory name
    fold_seed = 0  # NOTE(review): not referenced by the code visible in this cell
    seed = 0  # passed to seed_torch() and used as random_state when sampling
    start_epoch = 0  # first epoch index of the training loop
    epochs = 3  # number of epochs run after start_epoch
    train = True  # main() only trains when this is True
    folds = [0,1,2,3,4]  # NOTE(review): not referenced by the code visible in this cell
    n_fold = 5  # NOTE(review): not referenced by the code visible in this cell
    print_freq = 100  # progress is printed every print_freq steps
    use_amp = True  # enables autocast + GradScaler in train_fn
    # Label columns of the 2017 data; the model emits one logit per column.
    toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    ######################
    # Dataset #
    ######################
    head = 64  # tokens kept from the start of an over-long sequence
    tail = 64  # tokens kept from the end; 0 selects plain truncation in prepare_input
    max_length = head+tail  # padded sequence length fed to the model
    under_sampling = True  # down-sample rows whose target is 0
    under_sampling_ratio = 0.1  # fraction of target == 0 rows that is kept
    ######################
    # Augmentation #
    ######################

    ######################
    # Loaders #
    ######################
    batch_size = 32  # training batch size; the validation loader uses twice this
    num_workers = 8  # DataLoader worker processes
    ######################
    # Model #
    ######################
    # NOTE(review): the link below points at a different checkpoint than
    # base_model_name; it looks like a leftover from an earlier experiment.
    # https://huggingface.co/deepset/xlm-roberta-base-squad2
    base_model_name = "roberta-large"  # loaded via AutoModel / AutoTokenizer
    pretrained = True  # NOTE(review): not referenced by the code visible in this cell
    num_classes = len(toxic_cols)  # one output per entry in toxic_cols (multi-label)
    hidden_node = 1024  # large: 1024, base: 768
    ######################
    # Criterion #
    ######################
    loss_name = "BCEWithLogitsLoss"  # looked up in torch.nn, then in __CRITERIONS__
    loss_params: dict = {}  # keyword arguments for the loss constructor
    ######################
    # Optimizer #
    ######################
    optimizer_name = "AdamW"  # looked up in __OPTIMIZERS__, then in torch.optim
    optimizer_params = {
        "lr": 1e-5,
    }
    ######################
    # Scheduler #
    ######################
    scheduler = "cosine"  # get_scheduler handles "linear" and "cosine"
    num_cycles=0.5  # forwarded to the cosine schedule
    num_warmup_steps_ratio = 0.1  # warm-up steps as a fraction of total training steps
    

# ====================================================
# Directory settings
# ====================================================
# Root of the competition data and the per-experiment output directory.
INPUT_PATH = "../input/"
OUTPUT_DIR = f'../output/{CFG.exp_name}/'
# exist_ok=True replaces the exists()/makedirs() pair, which could race with
# another process creating the same directory between the check and the call.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    # Python-level sources of randomness.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, current GPU, then every GPU.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Deterministic kernels on, autotuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Return this module's logger, writing bare messages to the console and to ``log_file``.

    ``getLogger(__name__)`` hands back the same logger object on every call, so
    any handlers left from an earlier call (for example after re-running the
    notebook cell) are removed and closed first. Without that, each message is
    emitted once per accumulated handler and old log files stay open.

    Args:
        log_file: Path of the log file to write to.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Iterate over a copy: removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the share of pairs whose "more toxic" prediction is strictly larger.

    Args:
        more_toxic_preds: Predictions for the comment annotated as more toxic.
        less_toxic_preds: Predictions for the paired, less toxic comment.

    Returns:
        Mean of the element-wise ``more > less`` comparison.
    """
    correctly_ordered = more_toxic_preds > less_toxic_preds
    return np.mean(correctly_ordered)



def get_result(df):
    """Score a frame of paired predictions, log the score and return it.

    Args:
        df: DataFrame with ``more_toxic_preds`` and ``less_toxic_preds`` columns.

    Returns:
        The pairwise ranking score computed by ``get_score``.
    """
    score = get_score(df["more_toxic_preds"].values, df["less_toxic_preds"].values)
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def data2017_preprocess(train_1st, test_1st, test_labels_1st, val_data):
    """Build the training frame from the 2017 train and labelled test data.

    Test comments are joined to their labels on ``id``, rows whose ``toxic``
    label is -1 are dropped, and the result is stacked under the train rows.
    A ``target`` column holds the row-wise maximum over ``CFG.toxic_cols``.
    Comments that also occur in the validation pairs are removed.

    Args:
        train_1st: 2017 training frame.
        test_1st: 2017 test comments.
        test_labels_1st: Labels for ``test_1st``, keyed by ``id``.
        val_data: Validation pairs with ``less_toxic`` / ``more_toxic`` columns.

    Returns:
        The filtered training DataFrame.
    """
    labelled_test = test_1st.merge(test_labels_1st, on='id', how='left')
    labelled_test = labelled_test.query("toxic != -1")
    train_src = pd.concat([train_1st, labelled_test], axis='rows')

    # A comment counts as toxic when any of the label columns is set.
    train_src['target'] = train_src[CFG.toxic_cols].values.max(axis=1)

    # Exclude every comment that appears on either side of a validation pair.
    val_comment_unq = pd.concat([val_data['less_toxic'], val_data['more_toxic']]).unique()
    seen_in_validation = train_src['comment_text'].isin(val_comment_unq)
    train = train_src[~seen_in_validation]

    print('Sampling: Train')
    print(f'shape: {train.shape}')
    return train

def read_data():
    """Load the three competition CSV files found under ``INPUT_PATH``.

    Returns:
        Tuple ``(validation_data, test, sub)`` of DataFrames.
    """
    file_names = ("validation_data.csv", "comments_to_score.csv", "sample_submission.csv")
    validation_data, test, sub = (pd.read_csv(INPUT_PATH + name) for name in file_names)
    return validation_data, test, sub

def read_processed_data():
    """Load the competition files plus the preprocessed 2017 training data.

    The competition CSVs come from ``read_data()`` rather than a second copy of
    the same three reads, and the 2017 files are located relative to
    ``INPUT_PATH`` so that changing the input root moves every read together.

    Returns:
        Tuple ``(train2017, validation_data, test, sub)`` where ``train2017`` is
        the output of ``data2017_preprocess``.
    """
    validation_data, test, sub = read_data()
    jigsaw_1st_dir = INPUT_PATH + "jigsaw_1st/"
    train2017 = pd.read_csv(jigsaw_1st_dir + "train.csv")
    test2017 = pd.read_csv(jigsaw_1st_dir + "test.csv")
    test_labels2017 = pd.read_csv(jigsaw_1st_dir + "test_labels.csv")
    train2017 = data2017_preprocess(train2017, test2017, test_labels2017, validation_data)
    print(train2017.head())
    return train2017, validation_data, test, sub


# Patterns are compiled once instead of on every call; the cleaner runs once per
# comment. All patterns are raw strings: the original IP pattern was a plain
# string containing "\d", which is an invalid escape sequence.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
_IP_PATTERN = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
_SPACE_RUN_PATTERN = re.compile(r' +')

# Literal substitutions, applied in order. The " u " / " U " entries appear
# twice on purpose: str.replace is non-overlapping, so "a u u b" needs a second
# pass to become "a you you b".
_LITERAL_REPLACEMENTS = (
    ("!", " "),
    ("\n", ""),
    ("'", ""),
    ("|", ""),
    ("=", ""),
    ("F**K", "FUCK"),
    ("F__K", "FUCK"),
    ("f**k", "fuck"),
    ("f__k", "fuck"),
    ("f*ck", "fuck"),
    ("S$X", "SEX"),
    ("s$x", "sex"),
    (" u ", " you "),
    (" u ", " you "),
    (" U ", " you "),
    (" U ", " you "),
    ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
)


def text_cleaning(text):
    """Normalise one raw comment before tokenisation.

    Steps, in order: remove website links, replace e-mail addresses with ".",
    remove IPv4-looking number groups, apply the literal substitutions above,
    collapse runs of spaces and strip both ends.

    Args:
        text: Raw comment string.

    Returns:
        The cleaned string.
    """
    text = _URL_PATTERN.sub(r'', text)  # Removes website links
    text = _EMAIL_PATTERN.sub(r'.', text)  # Replaces e-mail addresses with "."
    # text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special characters
    text = _IP_PATTERN.sub(r'', text)  # Removes IP addresses
    # NOTE(review): newlines are deleted, not replaced by a space, so words on
    # adjacent lines are glued together. Kept as-is to match existing runs.
    for needle, replacement in _LITERAL_REPLACEMENTS:
        text = text.replace(needle, replacement)
    text = _SPACE_RUN_PATTERN.sub(' ', text)  # Remove extra spaces
    return text.strip()

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keeps the most recent value together with a count-weighted running mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the last value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as seen ``n`` times and refresh the running mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``"<minutes>m <seconds>s"``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return "%dm %ds" % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time and a linear estimate of the time remaining.

    Args:
        since: Start timestamp as returned by ``time.time()``.
        percent: Fraction of the work already done; must be non-zero.

    Returns:
        String of the form ``"<elapsed> (remain <remaining>)"``.
    """
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction completed so far.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))


def prepare_input(text, tokenizer):
    """Tokenise ``text`` into fixed-length ``torch.long`` tensors of size ``CFG.max_length``.

    Two modes, chosen by ``CFG.tail``:

    * ``CFG.tail == 0``: the tokenizer truncates and pads to ``CFG.max_length``.
    * otherwise: sequences longer than ``CFG.max_length`` keep their first
      ``CFG.head`` and last ``CFG.tail`` tokens; shorter ones are right-padded.
      ``input_ids`` is padded with ``tokenizer.pad_token_id``, every other
      field with 0.

    Args:
        text: String to encode.
        tokenizer: Hugging Face tokenizer providing ``encode_plus`` and
            ``pad_token_id``.

    Returns:
        The tokenizer output with each value replaced by a 1-D long tensor.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # padding="max_length" replaces the deprecated pad_to_max_length=True.
            padding="max_length",
            truncation=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True,
    )
    for k, v in inputs.items():
        if len(v) > CFG.max_length:
            # Keep both ends of an over-long sequence; head + tail == max_length.
            v = list(v[:CFG.head]) + list(v[-CFG.tail:])
        fill_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        # Integer buffer from the start; the original built a float array and
        # relied on slice clamping with a stale pre-truncation length.
        padded = np.full(CFG.max_length, fill_value, dtype=np.int64)
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


# https://zenn.dev/hellorusk/articles/7fd588cae5b173
# Differences between the huggingface Tokenizer methods tokenize, encode, encode_plus, etc.
class TrainDataset(Dataset):
    """Dataset that tokenises the ``text`` column on access.

    Each item is a dict with ``ids`` and ``mask`` (long tensors); when
    ``is_train`` is True it also carries ``label``, a float tensor holding the
    row's values of ``CFG.toxic_cols``.

    Args:
        df: DataFrame with a ``text`` column (and ``CFG.toxic_cols`` when
            ``is_train`` is True).
        tokenizer: Tokenizer forwarded to ``prepare_input``.
        max_length: Stored on the instance; ``prepare_input`` reads the length
            from ``CFG``.
        is_train: Whether labels are returned.
    """
    def __init__(self, df, tokenizer, max_length, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length
        self.text = df["text"].values
        # Build the label matrix once. Selecting the columns inside
        # __getitem__ re-materialised the whole table for every single sample.
        self.labels = df[CFG.toxic_cols].values if is_train else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # as_tensor reuses tensors that are already long instead of copying
        # them (torch.tensor on a tensor copies and emits a UserWarning).
        item = {
            'ids': torch.as_tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.as_tensor(inputs["attention_mask"], dtype=torch.long),
        }
        if self.is_train:
            item['label'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ====================================================
# LOSS
# ====================================================

# Registry for custom loss classes, consulted when torch.nn has no match.
__CRITERIONS__ = {}

def get_criterion():
    """Instantiate the loss named by ``CFG.loss_name`` with ``CFG.loss_params``.

    ``torch.nn`` is searched first, then the ``__CRITERIONS__`` registry.

    Raises:
        NotImplementedError: If the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        loss_cls = getattr(nn, loss_name)
    else:
        loss_cls = __CRITERIONS__.get(loss_name)
        if loss_cls is None:
            raise NotImplementedError
    return loss_cls(**CFG.loss_params)

# ====================================================
# Train
# ====================================================

# Custom optimizer
# Registry for custom optimizer classes; takes precedence over torch.optim.
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Build the optimizer named by ``CFG.optimizer_name`` over all model parameters.

    The ``__OPTIMIZERS__`` registry is checked first; otherwise the class is
    taken from ``torch.optim``. ``CFG.optimizer_params`` is passed as keywords.
    """
    optimizer_cls = __OPTIMIZERS__.get(CFG.optimizer_name)
    if optimizer_cls is None:
        optimizer_cls = getattr(optim, CFG.optimizer_name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)


# def get_scheduler(optimizer):
#     scheduler_name = CFG.scheduler_name

#     if scheduler_name is None:
#         return
#     else:
#         return optim.lr_scheduler.__getattribute__(scheduler_name)(optimizer, **CFG.scheduler_params)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warm-up learning-rate schedule selected by ``cfg.scheduler``.

    Args:
        cfg: Object exposing ``scheduler`` (``"linear"`` or ``"cosine"``),
            ``num_warmup_steps`` and, for cosine, ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: If ``cfg.scheduler`` is not a supported name. The original
            fell through to ``return scheduler`` and failed with an
            ``UnboundLocalError`` that did not name the bad value.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )

class Model(nn.Module):
    """Transformer encoder with mean pooling, dropout and a linear multi-label head.

    Args:
        modelname_or_path: Name or path given to ``AutoModel.from_pretrained``.
    """
    def __init__(self, modelname_or_path):
        super(Model, self).__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        # Take the head's input width from the loaded backbone so that switching
        # base/large checkpoints cannot silently mismatch; CFG.hidden_node is
        # the fallback for configs that do not expose hidden_size.
        hidden_size = getattr(self.base_model.config, "hidden_size", CFG.hidden_node)
        self.fc = nn.Linear(hidden_size, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.5)
        # self.ln = nn.LayerNorm(CFG.hidden_node)

    def feature(self, input_ids, attention_mask):
        """Return the mean of the encoder's first output over dimension 1."""
        outputs = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False
        )
        last_hidden_states = outputs[0]
        # NOTE(review): this is an unmasked mean, so padded positions contribute
        # to the pooled vector. Left unchanged because a masked mean would alter
        # the features that existing checkpoints were trained on.
        feature = torch.mean(last_hidden_states, 1)
        return feature

    def forward(self, input_ids, attention_mask=None):
        """Return one raw logit per class (no sigmoid applied)."""
        feature = self.feature(input_ids, attention_mask)
        output = self.fc(self.dropout(feature))
        return output




def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler,device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: Iterable of batches with ``ids``, ``mask`` and ``label``.
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss applied to ``(outputs, label)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only in the progress line.
        scheduler: Per-step learning-rate scheduler, or ``None``.
        device: Device the batch tensors are moved to.

    Returns:
        Sample-weighted mean training loss over the epoch.
    """
    # With enabled=False both GradScaler and autocast are pass-throughs, so a
    # single code path covers the AMP and non-AMP cases.
    scaler = GradScaler(enabled=CFG.use_amp)
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, batch_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        label = batch_data['label'].to(device)
        batch_size = ids.size(0)
        with autocast(enabled=CFG.use_amp):
            outputs = model(input_ids=ids, attention_mask=mask)
            loss = criterion(outputs, label)
        # record loss
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "LR: {lr: 8f}"
                .format(
                    epoch+1, step, len(train_loader), batch_time=batch_time,
                    data_time=data_time, loss=losses,
                    remain=timeSince(start, float(step+1)/len(train_loader)),
                    # Read the rate from the optimizer: this also works when
                    # scheduler is None, where scheduler.get_lr() would crash.
                    lr=optimizer.param_groups[0]["lr"]
                    )
                )
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Run ``model`` over ``valid_loader`` without gradients and collect its outputs.

    Args:
        valid_loader: Iterable of batches with ``ids`` and ``mask``.
        model: Network called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Unused; accepted to mirror the training signature.
        device: Device the batch tensors are moved to.

    Returns:
        NumPy array of the raw model outputs for all batches, concatenated
        along the first axis.
    """
    batch_timer = AverageMeter()
    load_timer = AverageMeter()
    model.eval()  # switch to evaluation mode
    collected = []
    total_batches = len(valid_loader)
    began = tick = time.time()
    for step, batch in enumerate(valid_loader):
        # time spent waiting for the loader
        load_timer.update(time.time() - tick)
        input_ids = batch['ids'].to(device)
        attention_mask = batch['mask'].to(device)
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
        collected.append(logits.to('cpu').numpy())
        # time spent on the whole batch
        batch_timer.update(time.time() - tick)
        tick = time.time()
        is_last_batch = step == (total_batches - 1)
        if step % CFG.print_freq == 0 or is_last_batch:
            print(
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                .format(
                    step, total_batches, batch_time=batch_timer,
                    data_time=load_timer,
                    remain=timeSince(began, float(step+1)/total_batches),
                    )
                )
    return np.concatenate(collected)


def train_loop(folds, validation):
    """Fine-tune the model on ``folds`` and score the validation pairs after each epoch.

    Every unique validation comment is scored once per epoch. Its ``pred`` is
    the sum of the raw per-class model outputs. The epoch score is the share of
    pairs whose more-toxic comment gets the larger ``pred``. The checkpoint is
    saved whenever the score improves.

    Fixes relative to the original implementation:

    * The per-class frames returned here come from the best-scoring epoch, so
      they match the saved checkpoint and the returned predictions. Before,
      they were whatever the last epoch produced.
    * ``validation`` is cleaned on a private copy instead of overwriting the
      caller's text columns.
    * The scheduler length uses ``len(train_loader)``, the number of optimizer
      steps actually taken with ``drop_last=True``.
    * ``best_score`` starts at ``-inf`` so the first epoch always records
      predictions, even with a score of 0.

    Args:
        folds: Training frame with ``comment_text``, ``target`` and
            ``CFG.toxic_cols`` columns.
        validation: Frame of pairs with ``less_toxic`` / ``more_toxic`` columns.

    Returns:
        Tuple ``(validation_last, validation_less, validation_more)``:
        the original pairs plus best-epoch ``more_toxic_preds`` /
        ``less_toxic_preds``, and the cleaned pairs joined with the best-epoch
        per-class outputs of the less / more toxic comment. The last two are
        ``None`` if no epoch ran.
    """
    validation_last = validation.copy()
    # Clean a private copy so the caller's frame keeps its raw text.
    validation = validation.copy()
    # ====================================================
    # loader
    # ====================================================
    train_folds = folds.reset_index(drop=True)

    print("Text cleaning...")
    train_folds['text'] = train_folds['comment_text'].progress_apply(text_cleaning)
    validation['less_toxic'] = validation['less_toxic'].progress_apply(text_cleaning)
    validation['more_toxic'] = validation['more_toxic'].progress_apply(text_cleaning)

    print("Train Shape:", train_folds.shape)
    if CFG.under_sampling:
        print("Under Sampling")
        # Keep every toxic row and only a fraction of the non-toxic ones.
        train_folds_0 = train_folds[train_folds["target"]==0]
        train_folds_0 = train_folds_0.sample(frac=CFG.under_sampling_ratio, random_state=CFG.seed)
        train_folds_1 = train_folds[train_folds["target"]>0]
        train_folds = pd.concat([train_folds_1, train_folds_0], axis=0).reset_index(drop=True)

    # Score each distinct comment once; pairs are re-assembled by merging on text.
    validation_data = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    validation_data = pd.DataFrame({'text': validation_data}).reset_index()
    print("Valid Shape:", validation_data.shape)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

    train_dataset = TrainDataset(train_folds, tokenizer, CFG.max_length, is_train=True)
    valid_dataset = TrainDataset(validation_data, tokenizer, CFG.max_length, is_train=False)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    # initialize
    model = Model(CFG.base_model_name)
    model.to(device)
    criterion = get_criterion()

    optimizer = get_optimizer(model)
    # One scheduler step per batch; len(train_loader) already accounts for drop_last.
    num_train_steps = len(train_loader) * CFG.epochs
    CFG.num_warmup_steps=num_train_steps*CFG.num_warmup_steps_ratio
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    best_score = -np.inf
    best_less = None
    best_more = None

    for epoch in range(CFG.start_epoch, CFG.start_epoch + CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
            )

        # eval
        preds = valid_fn(
            valid_loader,
            model,
            criterion,
            device
            )

        # per-class outputs and their sum for every unique comment
        validation_data[CFG.toxic_cols] = preds
        validation_data["pred"] = validation_data[CFG.toxic_cols].sum(axis=1)

        # Drop the previous epoch's predictions before re-attaching fresh ones.
        validation = validation.drop(
            columns=['less_toxic_preds', 'more_toxic_preds'], errors='ignore'
            )

        class_preds = validation_data[["text"]+CFG.toxic_cols]
        validation_less = validation.merge(
            class_preds.rename(columns={"text": 'less_toxic'}),
            on='less_toxic',
            how='left'
            )
        validation_more = validation.merge(
            class_preds.rename(columns={"text": 'more_toxic'}),
            on='more_toxic',
            how='left'
            )

        summed_preds = validation_data[["text", 'pred']]
        validation = validation.merge(
            summed_preds.rename(columns={"text": 'less_toxic', 'pred': 'less_toxic_preds'}),
            on='less_toxic',
            how='left'
            )
        validation = validation.merge(
            summed_preds.rename(columns={"text": 'more_toxic', 'pred': 'more_toxic_preds'}),
            on='more_toxic',
            how='left'
            )

        # scoring
        score = get_result(validation)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s")
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save : {score:.4f} Model")
            torch.save({"model": model.state_dict()},
                        OUTPUT_DIR+f'{CFG.base_model_name}_best_score.pth')
            validation_last["more_toxic_preds"] = validation["more_toxic_preds"]
            validation_last["less_toxic_preds"] = validation["less_toxic_preds"]
            # Keep the per-class frames of the epoch that produced the checkpoint.
            best_less = validation_less
            best_more = validation_more
    return validation_last, best_less, best_more


def main():
    """Seed the run, load the data, train once and log and save the validation results.

    In debug mode the run is cut to one epoch on a 1000-row sample. Nothing is
    trained or saved when ``CFG.train`` is False.
    """
    seed_torch(seed=CFG.seed)
    train, validation_data, test, sub = read_processed_data()
    if CFG.debug:
        CFG.epochs = 1
        train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
    oof_df = validation_data.copy()
    if not CFG.train:
        return

    # train
    n_pairs = len(validation_data)
    oof_more_toxic = np.zeros(n_pairs)
    oof_less_toxic = np.zeros(n_pairs)
    _oof_df, validation_less, validation_more = train_loop(train, validation_data)
    oof_more_toxic += (_oof_df["more_toxic_preds"].values)
    oof_less_toxic += (_oof_df["less_toxic_preds"].values)
    get_result(_oof_df)

    # CV result
    LOGGER.info(f"========== CV ==========")
    oof_df["more_toxic_preds"] = oof_more_toxic
    oof_df["less_toxic_preds"] = oof_less_toxic
    get_result(oof_df)

    # save result
    for frame, file_name in ((validation_less, "less_df.csv"), (validation_more, "more_df.csv")):
        frame.to_csv(OUTPUT_DIR+file_name, index=False)

In [None]:
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Sampling: Train
shape: (215920, 9)
                 id  ... target
0  0000997932d777bf  ...      0
1  000103f0d9cfb60f  ...      0
2  000113f07ec002fd  ...      0
3  0001b41b1c6bb37e  ...      0
4  0001d958c54c6e35  ...      0

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Under Sampling
Valid Shape: (14220, 2)


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Epoch: [1][0/1208] Data 0.963 (0.963) Elapsed 0m 2s (remain 46m 32s) Loss: 0.7037(0.7037) LR:  0.000000
Epoch: [1][100/1208] Data 0.000 (0.010) Elapsed 2m 10s (remain 23m 51s) Loss: 0.3234(0.5457) LR:  0.000003
Epoch: [1][200/1208] Data 0.000 (0.005) Elapsed 4m 19s (remain 21m 38s) Loss: 0.2096(0.3919) LR:  0.000006
Epoch: [1][300/1208] Data 0.000 (0.003) Elapsed 6m 27s (remain 19m 27s) Loss: 0.0846(0.3208) LR:  0.000008
Epoch: [1][400/1208] Data 0.000 (0.003) Elapsed 8m 36s (remain 17m 18s) Loss: 0.1725(0.2836) LR:  0.000010
Epoch: [1][500/1208] Data 0.000 (0.002) Elapsed 10m 44s (remain 15m 9s) Loss: 0.1792(0.2620) LR:  0.000010
Epoch: [1][600/1208] Data 0.000 (0.002) Elapsed 12m 52s (remain 13m 0s) Loss: 0.1972(0.2465) LR:  0.000010
Epoch: [1][700/1208] Data 0.000 (0.002) Elapsed 15m 1s (remain 10m 51s) Loss: 0.1996(0.2343) LR:  0.000010
Epoch: [1][800/1208] Data 0.000 (0.001) Elapsed 17m 9s (remain 8m 43s) Loss: 0.1907(0.2246) LR:  0.000010
Epoch: [1][900/1208] Data 0.000 (0.001) E

Score: 0.6996
Epoch 1 - avg_train_loss: 0.1990  time: 1727s
Epoch 1 - Score: 0.6996
Epoch 1 - Save : 0.6996 Model


Epoch: [2][0/1208] Data 0.914 (0.914) Elapsed 0m 2s (remain 44m 29s) Loss: 0.1334(0.1334) LR:  0.000008
Epoch: [2][100/1208] Data 0.000 (0.009) Elapsed 2m 10s (remain 23m 52s) Loss: 0.1505(0.1387) LR:  0.000008
Epoch: [2][200/1208] Data 0.000 (0.005) Elapsed 4m 19s (remain 21m 38s) Loss: 0.1665(0.1372) LR:  0.000008
Epoch: [2][300/1208] Data 0.000 (0.003) Elapsed 6m 27s (remain 19m 28s) Loss: 0.1382(0.1364) LR:  0.000007
Epoch: [2][400/1208] Data 0.000 (0.002) Elapsed 8m 36s (remain 17m 18s) Loss: 0.1373(0.1371) LR:  0.000007
Epoch: [2][500/1208] Data 0.000 (0.002) Elapsed 10m 44s (remain 15m 9s) Loss: 0.0814(0.1366) LR:  0.000006
Epoch: [2][600/1208] Data 0.000 (0.002) Elapsed 12m 52s (remain 13m 0s) Loss: 0.1365(0.1357) LR:  0.000006
Epoch: [2][700/1208] Data 0.000 (0.001) Elapsed 15m 1s (remain 10m 51s) Loss: 0.1770(0.1353) LR:  0.000005
Epoch: [2][800/1208] Data 0.000 (0.001) Elapsed 17m 9s (remain 8m 43s) Loss: 0.0916(0.1350) LR:  0.000005
Epoch: [2][900/1208] Data 0.000 (0.001) E

Score: 0.7032
Epoch 2 - avg_train_loss: 0.1332  time: 1727s
Epoch 2 - Score: 0.7032
Epoch 2 - Save : 0.7032 Model


Epoch: [3][0/1208] Data 0.935 (0.935) Elapsed 0m 2s (remain 44m 34s) Loss: 0.1028(0.1028) LR:  0.000003
Epoch: [3][100/1208] Data 0.000 (0.009) Elapsed 2m 10s (remain 23m 52s) Loss: 0.1178(0.1205) LR:  0.000003
Epoch: [3][200/1208] Data 0.000 (0.005) Elapsed 4m 19s (remain 21m 37s) Loss: 0.1434(0.1177) LR:  0.000002
Epoch: [3][300/1208] Data 0.000 (0.003) Elapsed 6m 27s (remain 19m 27s) Loss: 0.1309(0.1193) LR:  0.000002
Epoch: [3][400/1208] Data 0.000 (0.003) Elapsed 8m 35s (remain 17m 17s) Loss: 0.1228(0.1187) LR:  0.000001
Epoch: [3][500/1208] Data 0.000 (0.002) Elapsed 10m 44s (remain 15m 8s) Loss: 0.1439(0.1188) LR:  0.000001
Epoch: [3][600/1208] Data 0.000 (0.002) Elapsed 12m 52s (remain 13m 0s) Loss: 0.1245(0.1187) LR:  0.000001
Epoch: [3][700/1208] Data 0.000 (0.002) Elapsed 15m 0s (remain 10m 51s) Loss: 0.1274(0.1177) LR:  0.000001
Epoch: [3][800/1208] Data 0.000 (0.001) Elapsed 17m 9s (remain 8m 42s) Loss: 0.1259(0.1176) LR:  0.000000
Epoch: [3][900/1208] Data 0.000 (0.001) E

Score: 0.7028
Epoch 3 - avg_train_loss: 0.1169  time: 1726s
Epoch 3 - Score: 0.7028
Score: 0.7032
Score: 0.7032


In [7]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
tqdm.pandas()

device = ("cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration for the seed-1 run; settings are class attributes.

    NOTE(review): the code that consumes these settings lies partly beyond the
    visible end of this cell. The comments below assume it matches the earlier
    cell's pipeline; confirm before relying on them.
    """
    ######################
    # Globals #
    ######################
    debug = False
    exp_name = "exp1004_seed1"  # becomes the output sub-directory name
    fold_seed = 0
    seed = 1  # differs from the earlier cell, which uses seed 0
    start_epoch = 0
    epochs = 3
    train = True
    folds = [0,1,2,3,4]
    n_fold = 5
    print_freq = 100
    use_amp = True
    # Label columns of the 2017 data; data2017_preprocess takes their row-wise max.
    toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    ######################
    # Dataset #
    ######################
    head = 64
    tail = 64
    max_length = head+tail  # 128 with the values above
    under_sampling = True
    under_sampling_ratio = 0.1
    ######################
    # Augmentation #
    ######################

    ######################
    # Loaders #
    ######################
    batch_size = 32
    num_workers = 8
    ######################
    # Model #
    ######################
    # NOTE(review): the link below points at a different checkpoint than
    # base_model_name; it looks like a leftover from an earlier experiment.
    # https://huggingface.co/deepset/xlm-roberta-base-squad2
    base_model_name = "roberta-large"
    pretrained = True
    num_classes = len(toxic_cols)  # one output per entry in toxic_cols (multi-label)
    hidden_node = 1024  # large: 1024, base: 768
    ######################
    # Criterion #
    ######################
    loss_name = "BCEWithLogitsLoss"
    loss_params: dict = {}
    ######################
    # Optimizer #
    ######################
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 1e-5,
    }
    ######################
    # Scheduler #
    ######################
    scheduler = "cosine"
    num_cycles=0.5
    num_warmup_steps_ratio = 0.1  # presumably warm-up steps as a fraction of total steps
    

# ====================================================
# Directory settings
# ====================================================
# Root of the competition data and the per-experiment output directory.
INPUT_PATH = "../input/"
OUTPUT_DIR = f'../output/{CFG.exp_name}/'
# exist_ok=True replaces the exists()/makedirs() pair, which could race with
# another process creating the same directory between the check and the call.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every random number generator.
    """
    # Python-level sources of randomness.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, current GPU, then every GPU.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Deterministic kernels on, autotuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Return this module's logger, writing bare messages to the console and to ``log_file``.

    ``getLogger(__name__)`` hands back the same logger object on every call. If
    handlers from an earlier call are still attached (for example from a
    previous notebook cell using the same logger name), they are removed and
    closed first. Otherwise each message would be printed once per accumulated
    handler and would also keep being written to the earlier log file.

    Args:
        log_file: Path of the log file to write to.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # Iterate over a copy: removeHandler mutates logger.handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=log_file)
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the share of pairs whose "more toxic" prediction is strictly larger.

    Args:
        more_toxic_preds: Predictions for the comment annotated as more toxic.
        less_toxic_preds: Predictions for the paired, less toxic comment.

    Returns:
        Mean of the element-wise ``more > less`` comparison.
    """
    correctly_ordered = more_toxic_preds > less_toxic_preds
    return np.mean(correctly_ordered)



def get_result(df):
    """Score a frame of paired predictions, log the score and return it.

    Args:
        df: DataFrame with ``more_toxic_preds`` and ``less_toxic_preds`` columns.

    Returns:
        The pairwise ranking score computed by ``get_score``.
    """
    score = get_score(df["more_toxic_preds"].values, df["less_toxic_preds"].values)
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def data2017_preprocess(train_1st, test_1st, test_labels_1st, val_data):
    """Build the training frame from the 2017 train and labelled test data.

    Test comments are joined to their labels on ``id``, rows whose ``toxic``
    label is -1 are dropped, and the result is stacked under the train rows.
    A ``target`` column holds the row-wise maximum over ``CFG.toxic_cols``.
    Comments that also occur in the validation pairs are removed.

    Args:
        train_1st: 2017 training frame.
        test_1st: 2017 test comments.
        test_labels_1st: Labels for ``test_1st``, keyed by ``id``.
        val_data: Validation pairs with ``less_toxic`` / ``more_toxic`` columns.

    Returns:
        The filtered training DataFrame.
    """
    labelled_test = test_1st.merge(test_labels_1st, on='id', how='left')
    labelled_test = labelled_test.query("toxic != -1")
    train_src = pd.concat([train_1st, labelled_test], axis='rows')

    # A comment counts as toxic when any of the label columns is set.
    train_src['target'] = train_src[CFG.toxic_cols].values.max(axis=1)

    # Exclude every comment that appears on either side of a validation pair.
    val_comment_unq = pd.concat([val_data['less_toxic'], val_data['more_toxic']]).unique()
    seen_in_validation = train_src['comment_text'].isin(val_comment_unq)
    train = train_src[~seen_in_validation]

    print('Sampling: Train')
    print(f'shape: {train.shape}')
    return train

def read_data():
    """Load the three competition CSV files found under ``INPUT_PATH``.

    Returns:
        Tuple ``(validation_data, test, sub)`` of DataFrames.
    """
    file_names = ("validation_data.csv", "comments_to_score.csv", "sample_submission.csv")
    validation_data, test, sub = (pd.read_csv(INPUT_PATH + name) for name in file_names)
    return validation_data, test, sub

def read_processed_data():
    """Load the competition files plus the preprocessed 2017 training data.

    The competition CSVs come from ``read_data()`` rather than a second copy of
    the same three reads, and the 2017 files are located relative to
    ``INPUT_PATH`` so that changing the input root moves every read together.

    Returns:
        Tuple ``(train2017, validation_data, test, sub)`` where ``train2017`` is
        the output of ``data2017_preprocess``.
    """
    validation_data, test, sub = read_data()
    jigsaw_1st_dir = INPUT_PATH + "jigsaw_1st/"
    train2017 = pd.read_csv(jigsaw_1st_dir + "train.csv")
    test2017 = pd.read_csv(jigsaw_1st_dir + "test.csv")
    test_labels2017 = pd.read_csv(jigsaw_1st_dir + "test_labels.csv")
    train2017 = data2017_preprocess(train2017, test2017, test_labels2017, validation_data)
    print(train2017.head())
    return train2017, validation_data, test, sub


# Patterns are compiled once at definition time instead of on every call;
# text_cleaning is applied row by row to whole DataFrame columns.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# Raw string: the non-raw form relied on the invalid escape sequence "\d".
_IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
_MULTI_SPACE_RE = re.compile(' +')

# Literal substitutions, applied in this order.
# The " u " / " U " entries appear twice on purpose: one pass over
# "a u u b" consumes the space shared by the two matches and leaves the
# second "u" untouched; the second pass catches it.
_LITERAL_REPLACEMENTS = (
    ('\n', ''),
    ("'", ''),
    ("|", ""),
    ("=", ""),
    ("F**K", "FUCK"),
    ("F__K", "FUCK"),
    ("f**k", "fuck"),
    ("f__k", "fuck"),
    ("f*ck", "fuck"),
    ("S$X", "SEX"),
    ("s$x", "sex"),
    (" u ", " you "),
    (" u ", " you "),
    (" U ", " you "),
    (" U ", " you "),
    ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
)


def text_cleaning(text):
    """Normalise a raw comment string before tokenisation.

    Steps, in order: remove URLs, replace e-mail addresses with ".",
    remove IPv4-looking numbers, turn "!" into a space, apply the literal
    substitutions in ``_LITERAL_REPLACEMENTS`` (newline/quote/pipe/equals
    removal, a few obfuscated-word fixes, "u" -> "you"), collapse runs of
    spaces and strip leading/trailing whitespace.

    Args:
        text: the comment as a ``str``.

    Returns:
        The cleaned string.
    """
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('.', text)
    text = _IP_RE.sub('', text)
    text = text.replace('!', ' ')
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text.strip()

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keeps the most recent value and the running weighted mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``"<minutes>m <seconds>s"``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return "%dm %ds" % (minutes, seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    ``since`` is a ``time.time()`` timestamp and ``percent`` the fraction of
    work already done; the total is extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))


def prepare_input(text, tokenizer):
    """Tokenise ``text`` into fixed-length long tensors.

    With ``CFG.tail == 0`` the tokenizer itself truncates and pads to
    ``CFG.max_length``. Otherwise the text is encoded without padding and,
    when longer than ``CFG.max_length``, only the first ``CFG.head`` and the
    last ``CFG.tail`` positions are kept; the result is then right-padded
    to ``CFG.max_length`` (``tokenizer.pad_token_id`` for ``input_ids``,
    0 for every other field).

    Args:
        text: the string to encode.
        tokenizer: object exposing ``encode_plus`` and ``pad_token_id``
            (presumably a Hugging Face tokenizer; see the caller).

    Returns:
        The mapping returned by ``encode_plus`` with every value replaced
        by a ``torch.long`` tensor.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True`; same padding strategy.
            padding="max_length",
            truncation=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True,
    )
    for k, v in inputs.items():
        if len(v) > CFG.max_length:
            # Keep the beginning and the end of over-long sequences.
            v = np.hstack([v[:CFG.head], v[-CFG.tail:]])
        fill_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        padded = np.full(CFG.max_length, fill_value, dtype=np.int64)
        # Length is taken AFTER truncation so the copy never depends on
        # slice clipping.
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


# https://zenn.dev/hellorusk/articles/7fd588cae5b173
# Differences between the huggingface Tokenizer's tokenize, encode, encode_plus, etc.
class TrainDataset(Dataset):
    """Dataset yielding tokenised comments and, for training, their labels.

    Each item is a dict with ``ids`` and ``mask`` (long tensors produced by
    ``prepare_input``) and, when ``is_train`` is true, ``label`` (float
    tensor taken from the ``CFG.toxic_cols`` columns of ``df``).

    Args:
        df: DataFrame with a ``text`` column (and ``CFG.toxic_cols`` when
            ``is_train`` is true).
        tokenizer: tokenizer forwarded to ``prepare_input``.
        max_length: stored on the instance; the actual length is governed
            by ``CFG`` inside ``prepare_input``.
        is_train: whether labels are returned.
    """

    def __init__(self, df, tokenizer, max_length, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length
        self.text = df["text"].values
        # Extract the label matrix once. Doing `df[cols].values[idx]` per
        # item rebuilt the whole matrix for every sample.
        self.labels = df[CFG.toxic_cols].values if is_train else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # prepare_input already returns long tensors; as_tensor avoids the
        # copy (and warning) of re-wrapping a tensor in torch.tensor.
        item = {
            'ids': torch.as_tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.as_tensor(inputs["attention_mask"], dtype=torch.long),
        }
        if self.is_train:
            item['label'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ====================================================
# LOSS
# ====================================================

# Registry for custom loss classes, keyed by name (empty here).
__CRITERIONS__ = {}

def get_criterion():
    """Instantiate the loss named by ``CFG.loss_name`` with ``CFG.loss_params``.

    ``torch.nn`` is searched first, then the ``__CRITERIONS__`` registry.

    Raises:
        NotImplementedError: if the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        return getattr(nn, loss_name)(**CFG.loss_params)
    custom_cls = __CRITERIONS__.get(loss_name)
    if custom_cls is None:
        raise NotImplementedError
    return custom_cls(**CFG.loss_params)

# ====================================================
# Train
# ====================================================

# Custom optimizer
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Build the optimizer named by ``CFG.optimizer_name`` for ``model``.

    A class registered in ``__OPTIMIZERS__`` takes precedence; otherwise the
    name is looked up on ``torch.optim``. ``CFG.optimizer_params`` is passed
    as keyword arguments.
    """
    name = CFG.optimizer_name
    optimizer_cls = __OPTIMIZERS__.get(name)
    if optimizer_cls is None:
        optimizer_cls = getattr(optim, name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)


# def get_scheduler(optimizer):
#     scheduler_name = CFG.scheduler_name

#     if scheduler_name is None:
#         return
#     else:
#         return optim.lr_scheduler.__getattribute__(scheduler_name)(optimizer, **CFG.scheduler_params)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate schedule selected by ``cfg.scheduler``.

    Args:
        cfg: config object providing ``scheduler`` (``'linear'`` or
            ``'cosine'``), ``num_warmup_steps`` and, for the cosine
            schedule, ``num_cycles``.
        optimizer: the optimizer whose learning rate is scheduled.
        num_train_steps: total number of scheduler steps.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: if ``cfg.scheduler`` is not a supported name.
            Previously this case surfaced as an ``UnboundLocalError``.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
            num_cycles=cfg.num_cycles,
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
    )

class Model(nn.Module):
    """Pretrained encoder followed by dropout and a linear output layer.

    The linear layer maps ``CFG.hidden_node`` features to
    ``CFG.num_classes`` outputs; no activation is applied to the result.
    """

    def __init__(self, modelname_or_path):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged over axis 1."""
        encoded = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        # NOTE(review): the mean runs over every position on axis 1 and does
        # not use attention_mask, so padded positions are included; confirm
        # this is intended.
        return encoded[0].mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Return the linear head applied to the dropped-out pooled feature."""
        pooled = self.feature(input_ids, attention_mask)
        return self.fc(self.dropout(pooled))




def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler,device):
    """Train ``model`` for one pass over ``train_loader``.

    Each batch is a dict with ``ids``, ``mask`` and ``label`` tensors. With
    ``CFG.use_amp`` the forward pass runs under autocast and the backward
    pass goes through a ``GradScaler``; otherwise both are no-ops. The
    scheduler, when given, is stepped once per batch. Progress is printed
    every ``CFG.print_freq`` steps and on the last step.

    Args:
        train_loader: iterable of batches with a usable ``len``.
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: callable ``criterion(outputs, label)`` returning the loss.
        optimizer: optimizer updating ``model``.
        epoch: zero-based epoch index, used for logging only.
        scheduler: per-step LR scheduler, or ``None``.
        device: device the batch tensors are moved to.

    Returns:
        The mean loss over the epoch, weighted by batch size.
    """
    # A disabled scaler/autocast is a pass-through, so one code path serves
    # both the AMP and the full-precision configuration.
    scaler = GradScaler(enabled=CFG.use_amp)
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    for step, batch_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        label = batch_data['label'].to(device)
        batch_size = ids.size(0)
        with autocast(enabled=CFG.use_amp):
            outputs = model(input_ids=ids, attention_mask=mask)
            loss = criterion(outputs, label)
        # record loss
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            # Read the LR from the optimizer: this also works when no
            # scheduler is used (scheduler.get_lr() crashed on None).
            current_lr = optimizer.param_groups[0]["lr"]
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "LR: {lr: 8f}"
                .format(
                    epoch+1, step, len(train_loader), batch_time=batch_time,
                    data_time=data_time, loss=losses,
                    remain=timeSince(start, float(step+1)/len(train_loader)),
                    lr=current_lr
                    )
                )
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Run ``model`` over ``valid_loader`` and return all outputs as one array.

    The model is put in eval mode and called without gradient tracking on
    the ``ids`` / ``mask`` tensors of each batch. ``criterion`` is accepted
    for signature symmetry with ``train_fn`` but is not used. Progress is
    printed every ``CFG.print_freq`` steps and on the last step.
    """
    batch_time, data_time = AverageMeter(), AverageMeter()
    model.eval()
    collected = []
    start = end = time.time()
    for step, batch_data in enumerate(valid_loader):
        # time spent waiting for the loader
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        with torch.no_grad():
            outputs = model(input_ids=ids, attention_mask=mask)
        collected.append(outputs.to('cpu').numpy())
        # time for the whole step
        batch_time.update(time.time() - end)
        end = time.time()
        is_last_step = step == (len(valid_loader)-1)
        if step % CFG.print_freq == 0 or is_last_step:
            message = (
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
            ).format(
                step, len(valid_loader), batch_time=batch_time,
                data_time=data_time,
                remain=timeSince(start, float(step+1)/len(valid_loader)),
            )
            print(message)
    return np.concatenate(collected)


def _merge_side_preds(pairs, scored, side, value_cols):
    """Left-join ``value_cols`` of ``scored`` onto ``pairs`` via the ``side`` text column.

    ``scored['text']`` is renamed to ``side`` for the join; a selected
    ``pred`` column is renamed to ``f"{side}_preds"``.
    """
    lookup = scored[["text"] + list(value_cols)].rename(
        columns={"text": side, "pred": f"{side}_preds"}
    )
    return pairs.merge(lookup, on=side, how='left')


def train_loop(folds, validation):
    """Train on ``folds`` and evaluate on the ``validation`` pairs after every epoch.

    The training text is cleaned, non-toxic rows (``target == 0``) are
    optionally under-sampled, and the model is trained for ``CFG.epochs``
    epochs. After each epoch every unique cleaned validation comment is
    scored, the per-label outputs are summed into ``pred``, and the pair
    score is computed with ``get_result``. Whenever the score improves the
    model weights are saved under ``OUTPUT_DIR``.

    Args:
        folds: training DataFrame with ``comment_text``, ``target`` and the
            ``CFG.toxic_cols`` columns.
        validation: DataFrame with ``less_toxic`` and ``more_toxic`` text
            columns. It is not modified.

    Returns:
        ``(validation_last, validation_less, validation_more)``:
        the raw validation frame with the best epoch's ``*_preds`` columns,
        and the cleaned frame joined with the last epoch's per-label outputs
        for the less-toxic and the more-toxic comment respectively.
    """
    validation_last = validation.copy()
    # Work on a private copy so text cleaning and prediction columns never
    # leak into the caller's frame.
    validation = validation.copy()
    # ====================================================
    # loader
    # ====================================================
    train_folds = folds.reset_index(drop=True)

    print("Text cleaning...")
    train_folds['text'] = train_folds['comment_text'].progress_apply(text_cleaning)
    validation['less_toxic'] = validation['less_toxic'].progress_apply(text_cleaning)
    validation['more_toxic'] = validation['more_toxic'].progress_apply(text_cleaning)

    print("Train Shape:", train_folds.shape)
    if CFG.under_sampling:
        print("Under Sampling")
        # Keep every row with target > 0 and a random fraction of the rest.
        train_folds_0 = train_folds[train_folds["target"]==0]
        train_folds_0 = train_folds_0.sample(frac=CFG.under_sampling_ratio, random_state=CFG.seed)
        train_folds_1 = train_folds[train_folds["target"]>0]
        train_folds = pd.concat([train_folds_1, train_folds_0], axis=0).reset_index(drop=True)

    # Score each distinct comment once; pairs are rebuilt by merging on text.
    validation_data = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    validation_data = pd.DataFrame({'text': validation_data}).reset_index()
    print("Valid Shape:", validation_data.shape)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

    train_dataset = TrainDataset(train_folds, tokenizer, CFG.max_length, is_train=True)
    valid_dataset = TrainDataset(validation_data, tokenizer, CFG.max_length, is_train=False)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    # initialize
    model = Model(CFG.base_model_name)
    model.to(device)
    criterion = get_criterion()

    optimizer = get_optimizer(model)
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    CFG.num_warmup_steps=num_train_steps*CFG.num_warmup_steps_ratio
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    best_score = 0

    for epoch in range(CFG.start_epoch, CFG.start_epoch + CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
            )

        # eval
        preds = valid_fn(
            valid_loader,
            model,
            criterion,
            device
            )

        # One output column per label; `pred` is their row-wise sum.
        validation_data[CFG.toxic_cols] = preds
        validation_data["pred"] = validation_data[CFG.toxic_cols].sum(axis=1)

        # Drop the previous epoch's predictions before re-merging.
        validation = validation.drop(
            columns=['less_toxic_preds', 'more_toxic_preds'], errors='ignore'
        )
        validation_less = _merge_side_preds(validation, validation_data, 'less_toxic', CFG.toxic_cols)
        validation_more = _merge_side_preds(validation, validation_data, 'more_toxic', CFG.toxic_cols)

        validation = _merge_side_preds(validation, validation_data, 'less_toxic', ['pred'])
        validation = _merge_side_preds(validation, validation_data, 'more_toxic', ['pred'])

        # scoring
        score = get_result(validation)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s")
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save : {score:.4f} Model")
            torch.save({"model": model.state_dict()},
                        OUTPUT_DIR+f'{CFG.base_model_name}_best_score.pth')
            # Copy by position: the merged frame has a fresh index, so index
            # alignment would misplace rows if the input index is not 0..n-1.
            validation_last["more_toxic_preds"] = validation["more_toxic_preds"].values
            validation_last["less_toxic_preds"] = validation["less_toxic_preds"].values
    return validation_last, validation_less, validation_more


def main():
    """Seed everything, load the data, train once and report/save the results.

    In debug mode a single epoch is run on 1000 sampled training rows. When
    ``CFG.train`` is false nothing beyond loading happens. After training
    the score is logged for the frame returned by ``train_loop`` and again
    for the out-of-fold frame, and the per-label prediction frames are
    written to ``less_df.csv`` / ``more_df.csv`` in ``OUTPUT_DIR``.
    """
    seed_torch(seed=CFG.seed)
    train, validation_data, test, sub = read_processed_data()
    if CFG.debug:
        CFG.epochs = 1
        train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
    oof_df = validation_data.copy()
    if not CFG.train:
        return

    # train
    n_pairs = len(validation_data)
    _oof_df, validation_less, validation_more = train_loop(train, validation_data)
    oof_more_toxic = np.zeros(n_pairs) + _oof_df["more_toxic_preds"].values
    oof_less_toxic = np.zeros(n_pairs) + _oof_df["less_toxic_preds"].values
    get_result(_oof_df)

    # CV result
    LOGGER.info("========== CV ==========")
    oof_df["more_toxic_preds"] = oof_more_toxic
    oof_df["less_toxic_preds"] = oof_less_toxic
    get_result(oof_df)

    # save result
    for frame, file_name in ((validation_less, "less_df.csv"), (validation_more, "more_df.csv")):
        frame.to_csv(OUTPUT_DIR+file_name, index=False)

In [8]:
# Run the pipeline only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()

Sampling: Train
shape: (215920, 9)
                 id  ... target
0  0000997932d777bf  ...      0
1  000103f0d9cfb60f  ...      0
2  000113f07ec002fd  ...      0
3  0001b41b1c6bb37e  ...      0
4  0001d958c54c6e35  ...      0

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Under Sampling
Valid Shape: (14220, 2)


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Epoch: [1][0/1208] Data 0.613 (0.613) Elapsed 0m 1s (remain 27m 31s) Loss: 0.7609(0.7609) LR:  0.000000
Epoch: [1][100/1208] Data 0.000 (0.006) Elapsed 1m 15s (remain 13m 42s) Loss: 0.2864(0.5884) LR:  0.000003
Epoch: [1][200/1208] Data 0.000 (0.003) Elapsed 2m 34s (remain 12m 55s) Loss: 0.1876(0.4042) LR:  0.000006
Epoch: [1][300/1208] Data 0.000 (0.002) Elapsed 3m 55s (remain 11m 50s) Loss: 0.1366(0.3307) LR:  0.000008
Epoch: [1][400/1208] Data 0.000 (0.002) Elapsed 5m 16s (remain 10m 37s) Loss: 0.2488(0.2904) LR:  0.000010
Epoch: [1][500/1208] Data 0.000 (0.001) Elapsed 6m 38s (remain 9m 22s) Loss: 0.2097(0.2656) LR:  0.000010
Epoch: [1][600/1208] Data 0.000 (0.001) Elapsed 7m 59s (remain 8m 4s) Loss: 0.2175(0.2480) LR:  0.000010
Epoch: [1][700/1208] Data 0.000 (0.001) Elapsed 9m 20s (remain 6m 45s) Loss: 0.1404(0.2340) LR:  0.000010
Epoch: [1][800/1208] Data 0.000 (0.001) Elapsed 10m 41s (remain 5m 26s) Loss: 0.2323(0.2239) LR:  0.000010
Epoch: [1][900/1208] Data 0.000 (0.001) Elap

Score: 0.6996
Epoch 1 - avg_train_loss: 0.1988  time: 1308s
Epoch 1 - Score: 0.6996
Epoch 1 - Save : 0.6996 Model


Epoch: [2][0/1208] Data 1.196 (1.196) Elapsed 0m 1s (remain 40m 4s) Loss: 0.1384(0.1384) LR:  0.000008
Epoch: [2][100/1208] Data 0.000 (0.012) Elapsed 1m 23s (remain 15m 13s) Loss: 0.2053(0.1408) LR:  0.000008
Epoch: [2][200/1208] Data 0.000 (0.006) Elapsed 2m 44s (remain 13m 45s) Loss: 0.0925(0.1375) LR:  0.000008
Epoch: [2][300/1208] Data 0.000 (0.004) Elapsed 4m 5s (remain 12m 20s) Loss: 0.1185(0.1359) LR:  0.000007
Epoch: [2][400/1208] Data 0.000 (0.003) Elapsed 5m 27s (remain 10m 58s) Loss: 0.1185(0.1343) LR:  0.000007
Epoch: [2][500/1208] Data 0.000 (0.003) Elapsed 6m 48s (remain 9m 36s) Loss: 0.0897(0.1329) LR:  0.000006
Epoch: [2][600/1208] Data 0.000 (0.002) Elapsed 8m 9s (remain 8m 14s) Loss: 0.1212(0.1329) LR:  0.000006
Epoch: [2][700/1208] Data 0.000 (0.002) Elapsed 9m 30s (remain 6m 52s) Loss: 0.1209(0.1329) LR:  0.000005
Epoch: [2][800/1208] Data 0.000 (0.002) Elapsed 10m 51s (remain 5m 31s) Loss: 0.0929(0.1326) LR:  0.000005
Epoch: [2][900/1208] Data 0.000 (0.001) Elapse

Score: 0.7014
Epoch 2 - avg_train_loss: 0.1319  time: 1316s
Epoch 2 - Score: 0.7014
Epoch 2 - Save : 0.7014 Model


Epoch: [3][0/1208] Data 1.055 (1.055) Elapsed 0m 1s (remain 38m 4s) Loss: 0.1236(0.1236) LR:  0.000003
Epoch: [3][100/1208] Data 0.000 (0.011) Elapsed 1m 23s (remain 15m 11s) Loss: 0.1313(0.1168) LR:  0.000003
Epoch: [3][200/1208] Data 0.000 (0.005) Elapsed 2m 44s (remain 13m 44s) Loss: 0.1346(0.1168) LR:  0.000002
Epoch: [3][300/1208] Data 0.000 (0.004) Elapsed 4m 5s (remain 12m 19s) Loss: 0.1013(0.1182) LR:  0.000002
Epoch: [3][400/1208] Data 0.000 (0.003) Elapsed 5m 26s (remain 10m 56s) Loss: 0.1261(0.1182) LR:  0.000001
Epoch: [3][500/1208] Data 0.000 (0.002) Elapsed 6m 47s (remain 9m 34s) Loss: 0.1350(0.1183) LR:  0.000001
Epoch: [3][600/1208] Data 0.000 (0.002) Elapsed 8m 8s (remain 8m 13s) Loss: 0.1250(0.1169) LR:  0.000001
Epoch: [3][700/1208] Data 0.000 (0.002) Elapsed 9m 29s (remain 6m 51s) Loss: 0.1478(0.1171) LR:  0.000001
Epoch: [3][800/1208] Data 0.000 (0.001) Elapsed 10m 49s (remain 5m 30s) Loss: 0.1526(0.1176) LR:  0.000000
Epoch: [3][900/1208] Data 0.000 (0.001) Elapse

Score: 0.7039
Epoch 3 - avg_train_loss: 0.1162  time: 1314s
Epoch 3 - Score: 0.7039
Epoch 3 - Save : 0.7039 Model
Score: 0.7039
Score: 0.7039


In [9]:
import os
import sys
import numpy as np
import pandas as pd
import math
import time
import random
import shutil
import copy
import collections
from pathlib import Path
from contextlib import contextmanager
from collections import defaultdict, Counter
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from tqdm.auto import tqdm
from functools import partial
import torch
import torch.nn as nn
from torch.nn import MarginRankingLoss
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import CosineAnnealingLR
import transformers
from transformers import (AutoModel, AutoTokenizer)
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")
from torch.cuda.amp import autocast, GradScaler
import re
from bs4 import BeautifulSoup
tqdm.pandas()

device = ("cuda" if torch.cuda.is_available() else "cpu")


# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration, read as class attributes throughout the file."""
    ######################
    # Globals #
    ######################
    # debug: shortens the run for a quick check (presumably 1 epoch on a
    # small sample, as in main() - confirm against the caller).
    debug = False
    # Names the output directory ../output/<exp_name>/.
    exp_name = "exp1004_seed2"
    fold_seed = 0
    seed = 2
    start_epoch = 0
    epochs = 3
    train = True
    # fold_seed / folds / n_fold are not referenced by the visible code.
    folds = [0,1,2,3,4]
    n_fold = 5
    # Progress is printed every `print_freq` steps.
    print_freq = 100
    use_amp = True
    # Label columns used as training targets.
    toxic_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    ######################
    # Dataset #
    ######################
    # Over-long inputs keep the first `head` and last `tail` positions.
    head = 64
    tail = 64
    max_length = head+tail
    # When enabled, only `under_sampling_ratio` of the target == 0 rows are kept.
    under_sampling = True
    under_sampling_ratio = 0.1
    ######################
    # Augmentation #
    ######################

    ######################
    # Loaders #
    ######################
    batch_size = 32
    num_workers = 8
    ######################
    # Model #
    ######################
    # Name passed to AutoModel / AutoTokenizer.from_pretrained.
    base_model_name = "roberta-large"
    pretrained = True
    num_classes = len(toxic_cols)  # one output per label column
    hidden_node = 1024  # large: 1024, base: 768
    ######################
    # Criterion #
    ######################
    # Looked up on torch.nn by name, instantiated with loss_params.
    loss_name = "BCEWithLogitsLoss"
    loss_params: dict = {}
    ######################
    # Optimizer #
    ######################
    # Looked up on torch.optim by name, instantiated with optimizer_params.
    optimizer_name = "AdamW"
    optimizer_params = {
        "lr": 1e-5,
    }
    ######################
    # Scheduler #
    ######################
    # 'linear' or 'cosine' (see get_scheduler).
    scheduler = "cosine"
    num_cycles=0.5
    # Fraction of the total training steps used for warmup.
    num_warmup_steps_ratio = 0.1
    

# ====================================================
# Directory settings
# ====================================================
# Input files are read from INPUT_PATH; logs, weights and CSVs of this
# experiment are written to OUTPUT_DIR.
INPUT_PATH = "../input/"
OUTPUT_DIR = f'../output/{CFG.exp_name}/'
# exist_ok avoids the check-then-create race and keeps cell re-runs idempotent.
os.makedirs(OUTPUT_DIR, exist_ok=True)


# ====================================================
# Utils
# ====================================================

def seed_torch(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` and disables cuDNN benchmarking.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_rng in seeders:
        seed_rng(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def init_logger(log_file=OUTPUT_DIR+"train.log"):
    """Configure and return this module's logger (console + ``log_file``).

    Messages are emitted at INFO level using the bare ``%(message)s``
    format. Handlers left over from an earlier call are removed and closed
    first: ``getLogger(__name__)`` returns the same logger every time, so
    re-running this in a notebook used to stack handlers, duplicating every
    line and still writing to the previous experiment's log file.

    Args:
        log_file: path of the log file to write to.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=log_file)):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger
    
# Shared logger used by the scoring helpers below (e.g. get_result).
LOGGER = init_logger()

def get_score(more_toxic_preds, less_toxic_preds):
    """Return the fraction of pairs that are ranked correctly.

    A pair counts as correct when the prediction for the "more toxic"
    comment is strictly greater than the one for the "less toxic" comment.
    """
    correctly_ranked = more_toxic_preds > less_toxic_preds
    return np.mean(correctly_ranked)



def get_result(df):
    """Score the pairwise predictions stored in ``df`` and log the result.

    Reads the ``more_toxic_preds`` and ``less_toxic_preds`` columns, passes
    them to ``get_score`` and writes the score to ``LOGGER`` before
    returning it.
    """
    score = get_score(
        df["more_toxic_preds"].values,
        df["less_toxic_preds"].values,
    )
    LOGGER.info(f"Score: {score:<.4f}")
    return score


def data2017_preprocess(train_1st, test_1st, test_labels_1st, val_data):
    """Build the training frame from the 2017 train/test files.

    Test rows are joined to their labels on ``id`` and rows whose ``toxic``
    label is -1 are dropped. A ``target`` column holding the row-wise
    maximum over ``CFG.toxic_cols`` is added, and every row whose
    ``comment_text`` also appears in ``val_data`` (either side of a pair)
    is removed. The shape of the result is printed.
    """
    labelled_test = test_1st.merge(test_labels_1st, on='id', how='left')
    labelled_test = labelled_test.query("toxic != -1")
    train_src = pd.concat([train_1st, labelled_test], axis='rows')
    # Row-wise max over the label columns, used as the toxic / non-toxic flag.
    train_src['target'] = train_src[CFG.toxic_cols].values.max(axis=1)
    val_sides = [val_data['less_toxic'], val_data['more_toxic']]
    val_comment_unq = pd.concat(val_sides).unique()
    in_validation = train_src['comment_text'].isin(val_comment_unq)
    train = train_src[~in_validation]
    print('Sampling: Train')
    print(f'shape: {train.shape}')
    return train

def read_data():
    """Load the validation pairs, the comments to score and the sample submission.

    All three CSV files are read from ``INPUT_PATH`` and returned as
    ``(validation_data, test, sub)``.
    """
    file_names = (
        "validation_data.csv",
        "comments_to_score.csv",
        "sample_submission.csv",
    )
    validation_data, test, sub = (
        pd.read_csv(INPUT_PATH + file_name) for file_name in file_names
    )
    return validation_data, test, sub

def read_processed_data():
    """Load the competition files plus the preprocessed 2017 training data.

    Returns ``(train2017, validation_data, test, sub)`` where ``train2017``
    is the output of ``data2017_preprocess``; its head is printed.
    """
    # Same three reads as read_data(): validation pairs, test comments, sample submission.
    validation_data, test, sub = read_data()
    train2017 = data2017_preprocess(
        pd.read_csv("../input/jigsaw_1st/train.csv"),
        pd.read_csv('../input/jigsaw_1st/test.csv'),
        pd.read_csv('../input/jigsaw_1st/test_labels.csv'),
        validation_data,
    )
    print(train2017.head())
    return train2017, validation_data, test, sub


# Patterns are compiled once at definition time instead of on every call;
# text_cleaning is applied row by row to whole DataFrame columns.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_RE = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")
# Raw string: the non-raw form relied on the invalid escape sequence "\d".
_IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
_MULTI_SPACE_RE = re.compile(' +')

# Literal substitutions, applied in this order.
# The " u " / " U " entries appear twice on purpose: one pass over
# "a u u b" consumes the space shared by the two matches and leaves the
# second "u" untouched; the second pass catches it.
_LITERAL_REPLACEMENTS = (
    ('\n', ''),
    ("'", ''),
    ("|", ""),
    ("=", ""),
    ("F**K", "FUCK"),
    ("F__K", "FUCK"),
    ("f**k", "fuck"),
    ("f__k", "fuck"),
    ("f*ck", "fuck"),
    ("S$X", "SEX"),
    ("s$x", "sex"),
    (" u ", " you "),
    (" u ", " you "),
    (" U ", " you "),
    (" U ", " you "),
    ("YOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOUUUUUUUUUUUUUUUUUUUU", "YOU"),
)


def text_cleaning(text):
    """Normalise a raw comment string before tokenisation.

    Steps, in order: remove URLs, replace e-mail addresses with ".",
    remove IPv4-looking numbers, turn "!" into a space, apply the literal
    substitutions in ``_LITERAL_REPLACEMENTS`` (newline/quote/pipe/equals
    removal, a few obfuscated-word fixes, "u" -> "you"), collapse runs of
    spaces and strip leading/trailing whitespace.

    Args:
        text: the comment as a ``str``.

    Returns:
        The cleaned string.
    """
    text = _URL_RE.sub('', text)
    text = _EMAIL_RE.sub('.', text)
    text = _IP_RE.sub('', text)
    text = text.replace('!', ' ')
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text.strip()

# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Keeps the most recent value and the running weighted mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set the last value, mean, weighted sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``"<minutes>m <seconds>s"``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return "%dm %ds" % (minutes, seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    ``since`` is a ``time.time()`` timestamp and ``percent`` the fraction of
    work already done; the total is extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return "%s (remain %s)" % (asMinutes(elapsed), asMinutes(remaining))


def prepare_input(text, tokenizer):
    """Tokenise ``text`` into fixed-length long tensors.

    With ``CFG.tail == 0`` the tokenizer itself truncates and pads to
    ``CFG.max_length``. Otherwise the text is encoded without padding and,
    when longer than ``CFG.max_length``, only the first ``CFG.head`` and the
    last ``CFG.tail`` positions are kept; the result is then right-padded
    to ``CFG.max_length`` (``tokenizer.pad_token_id`` for ``input_ids``,
    0 for every other field).

    Args:
        text: the string to encode.
        tokenizer: object exposing ``encode_plus`` and ``pad_token_id``
            (presumably a Hugging Face tokenizer; see the caller).

    Returns:
        The mapping returned by ``encode_plus`` with every value replaced
        by a ``torch.long`` tensor.
    """
    if CFG.tail == 0:
        inputs = tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.max_length,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True`; same padding strategy.
            padding="max_length",
            truncation=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        truncation=True,
    )
    for k, v in inputs.items():
        if len(v) > CFG.max_length:
            # Keep the beginning and the end of over-long sequences.
            v = np.hstack([v[:CFG.head], v[-CFG.tail:]])
        fill_value = tokenizer.pad_token_id if k == 'input_ids' else 0
        padded = np.full(CFG.max_length, fill_value, dtype=np.int64)
        # Length is taken AFTER truncation so the copy never depends on
        # slice clipping.
        padded[:len(v)] = v
        inputs[k] = torch.tensor(padded, dtype=torch.long)
    return inputs


# https://zenn.dev/hellorusk/articles/7fd588cae5b173
# Differences between the huggingface Tokenizer's tokenize, encode, encode_plus, etc.
class TrainDataset(Dataset):
    """Dataset yielding tokenised comments and, for training, their labels.

    Each item is a dict with ``ids`` and ``mask`` (long tensors produced by
    ``prepare_input``) and, when ``is_train`` is true, ``label`` (float
    tensor taken from the ``CFG.toxic_cols`` columns of ``df``).

    Args:
        df: DataFrame with a ``text`` column (and ``CFG.toxic_cols`` when
            ``is_train`` is true).
        tokenizer: tokenizer forwarded to ``prepare_input``.
        max_length: stored on the instance; the actual length is governed
            by ``CFG`` inside ``prepare_input``.
        is_train: whether labels are returned.
    """

    def __init__(self, df, tokenizer, max_length, is_train=True):
        self.df = df
        self.tokenizer = tokenizer
        self.is_train = is_train
        self.max_length = max_length
        self.text = df["text"].values
        # Extract the label matrix once. Doing `df[cols].values[idx]` per
        # item rebuilt the whole matrix for every sample.
        self.labels = df[CFG.toxic_cols].values if is_train else None

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        inputs = prepare_input(str(self.text[idx]), self.tokenizer)
        # prepare_input already returns long tensors; as_tensor avoids the
        # copy (and warning) of re-wrapping a tensor in torch.tensor.
        item = {
            'ids': torch.as_tensor(inputs["input_ids"], dtype=torch.long),
            'mask': torch.as_tensor(inputs["attention_mask"], dtype=torch.long),
        }
        if self.is_train:
            item['label'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# ====================================================
# LOSS
# ====================================================

# Registry for custom loss classes, keyed by name (empty here).
__CRITERIONS__ = {}

def get_criterion():
    """Instantiate the loss named by ``CFG.loss_name`` with ``CFG.loss_params``.

    ``torch.nn`` is searched first, then the ``__CRITERIONS__`` registry.

    Raises:
        NotImplementedError: if the name is found in neither place.
    """
    loss_name = CFG.loss_name
    if hasattr(nn, loss_name):
        return getattr(nn, loss_name)(**CFG.loss_params)
    custom_cls = __CRITERIONS__.get(loss_name)
    if custom_cls is None:
        raise NotImplementedError
    return custom_cls(**CFG.loss_params)

# ====================================================
# Train
# ====================================================

# Custom optimizer
__OPTIMIZERS__ = {}


def get_optimizer(model: nn.Module):
    """Build the optimizer named by ``CFG.optimizer_name`` for ``model``.

    A class registered in ``__OPTIMIZERS__`` takes precedence; otherwise the
    name is looked up on ``torch.optim``. ``CFG.optimizer_params`` is passed
    as keyword arguments.
    """
    name = CFG.optimizer_name
    optimizer_cls = __OPTIMIZERS__.get(name)
    if optimizer_cls is None:
        optimizer_cls = getattr(optim, name)
    return optimizer_cls(model.parameters(), **CFG.optimizer_params)


# def get_scheduler(optimizer):
#     scheduler_name = CFG.scheduler_name

#     if scheduler_name is None:
#         return
#     else:
#         return optim.lr_scheduler.__getattribute__(scheduler_name)(optimizer, **CFG.scheduler_params)

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the warmup learning-rate schedule selected by ``cfg.scheduler``.

    Args:
        cfg: config object providing ``scheduler`` (``'linear'`` or
            ``'cosine'``), ``num_warmup_steps`` and, for the cosine
            schedule, ``num_cycles``.
        optimizer: the optimizer whose learning rate is scheduled.
        num_train_steps: total number of scheduler steps.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: if ``cfg.scheduler`` is not a supported name.
            Previously this case surfaced as an ``UnboundLocalError``.
    """
    if cfg.scheduler == 'linear':
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
            num_cycles=cfg.num_cycles,
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'."
    )

class Model(nn.Module):
    """Pretrained transformer encoder with a dropout + linear head on top."""

    def __init__(self, modelname_or_path):
        """Load the encoder from ``modelname_or_path`` and create the head.

        The head maps ``CFG.hidden_node`` features to ``CFG.num_classes``
        outputs.
        """
        super().__init__()
        self.base_model = AutoModel.from_pretrained(modelname_or_path)
        self.fc = nn.Linear(CFG.hidden_node, CFG.num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def feature(self, input_ids, attention_mask):
        """Return the encoder's first output averaged over dimension 1.

        The average is unweighted: ``attention_mask`` is forwarded to the
        encoder only and plays no part in the pooling itself.
        """
        encoder_out = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        token_states = encoder_out[0]
        return token_states.mean(dim=1)

    def forward(self, input_ids, attention_mask=None):
        """Pool the encoder output, apply dropout, and project with ``fc``."""
        pooled = self.feature(input_ids, attention_mask)
        regularised = self.dropout(pooled)
        return self.fc(regularised)




def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        train_loader: iterable of batches; each batch is a mapping with
            'ids', 'mask' and 'label' tensors.
        model: network called as ``model(input_ids=..., attention_mask=...)``.
        criterion: loss called as ``criterion(outputs, label)``.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index, used only for the progress print.
        scheduler: LR scheduler stepped once per batch, or None to keep the
            learning rate fixed.
        device: device the batch tensors are moved to.

    Returns:
        ``losses.avg`` — the running loss average kept by ``AverageMeter``
        (updated with each batch loss and its batch size).
    """
    # NOTE: the GradScaler is created per call, so its loss-scale state does
    # not carry over from one epoch to the next.
    scaler = GradScaler() if CFG.use_amp else None
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    # switch to train mode
    model.train()
    start = end = time.time()
    num_steps = len(train_loader)
    for step, batch_data in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        ids = batch_data['ids'].to(device)
        mask = batch_data['mask'].to(device)
        label = batch_data['label'].to(device)
        batch_size = ids.size(0)
        if CFG.use_amp:
            with autocast():
                outputs = model(input_ids=ids, attention_mask=mask)
                loss = criterion(outputs, label)
        else:
            outputs = model(input_ids=ids, attention_mask=mask)
            loss = criterion(outputs, label)
        # record loss
        losses.update(loss.item(), batch_size)
        if CFG.use_amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()
        optimizer.zero_grad()
        if scheduler is not None:
            scheduler.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        if step % CFG.print_freq == 0 or step == (num_steps - 1):
            # Read the LR from the optimizer: this works when scheduler is
            # None and avoids calling the scheduler's get_lr() outside step().
            current_lr = optimizer.param_groups[0]["lr"]
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "LR: {lr: 8f}"
                .format(
                    epoch+1, step, num_steps, batch_time=batch_time,
                    data_time=data_time, loss=losses,
                    remain=timeSince(start, float(step+1)/num_steps),
                    lr=current_lr
                    )
                )
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Run ``model`` over ``valid_loader`` without gradients.

    Args:
        valid_loader: iterable of batches, each a mapping with 'ids' and
            'mask' tensors.
        model: network called as ``model(input_ids=..., attention_mask=...)``.
        criterion: accepted for signature symmetry with ``train_fn``; unused.
        device: device the batch tensors are moved to.

    Returns:
        A NumPy array with the raw model outputs of all batches concatenated
        in loader order.
    """
    step_timer = AverageMeter()
    load_timer = AverageMeter()
    # inference mode for dropout etc.
    model.eval()
    collected = []
    total_steps = len(valid_loader)
    began = tick = time.time()
    for step, batch in enumerate(valid_loader):
        # time spent waiting for the loader
        load_timer.update(time.time() - tick)
        input_ids = batch['ids'].to(device)
        attention = batch['mask'].to(device)
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention)
        collected.append(logits.to('cpu').numpy())
        # time for the whole step
        step_timer.update(time.time() - tick)
        tick = time.time()
        is_report_step = step % CFG.print_freq == 0
        is_last_step = step == (total_steps - 1)
        if is_report_step or is_last_step:
            progress = float(step + 1) / total_steps
            print(
                "EVAL: [{0}/{1}] "
                "Data {data_time.val:.3f} ({data_time.avg:.3f}) "
                "Elapsed {remain:s} "
                .format(
                    step, total_steps, batch_time=step_timer,
                    data_time=load_timer,
                    remain=timeSince(began, progress),
                    )
                )
    return np.concatenate(collected)


def train_loop(folds, validation):
    """Fine-tune the model on ``folds`` and score it on ``validation`` pairs.

    After every epoch the model predicts each unique validation text, the
    per-class outputs are summed into one ``pred`` value per text, and the
    pair frame is scored with ``get_result``. The checkpoint and all returned
    frames correspond to the best-scoring epoch.

    Args:
        folds: training frame; this function reads ``comment_text`` and
            ``target``. Other columns are consumed by ``TrainDataset``.
        validation: pair frame with ``less_toxic`` and ``more_toxic`` text
            columns. It is not modified.

    Returns:
        Tuple ``(validation_last, validation_less, validation_more)``:
        ``validation_last`` is a copy of the input (uncleaned text) with
        ``less_toxic_preds`` / ``more_toxic_preds`` added; the other two are
        the cleaned pair frame joined with the ``CFG.toxic_cols`` outputs of
        the less / more toxic text respectively. If no epoch improved on the
        initial best score, the last epoch's frames are returned instead
        (None when no epoch ran).
    """
    # Copy with the original text; it receives the best-epoch pair scores.
    validation_last = validation.copy()
    # Private working copy so text cleaning does not overwrite the caller's frame.
    validation = validation.copy()
    # ====================================================
    # loader
    # ====================================================
    train_folds = folds.reset_index(drop=True)

    print("Text cleaning...")
    train_folds['text'] = train_folds['comment_text'].progress_apply(text_cleaning)
    validation['less_toxic'] = validation['less_toxic'].progress_apply(text_cleaning)
    validation['more_toxic'] = validation['more_toxic'].progress_apply(text_cleaning)

    print("Train Shape:", train_folds.shape)
    if CFG.under_sampling:
        print("Under Sampling")
        # Keep every row with target > 0 and only a fraction of the target == 0 rows.
        train_folds_0 = train_folds[train_folds["target"]==0]
        train_folds_0 = train_folds_0.sample(frac=CFG.under_sampling_ratio, random_state=CFG.seed)
        train_folds_1 = train_folds[train_folds["target"]>0]
        train_folds = pd.concat([train_folds_1, train_folds_0], axis=0).reset_index(drop=True)

    # Each distinct text is predicted once and joined back onto the pairs.
    validation_data = sorted(set(validation['less_toxic'].unique()) | set(validation['more_toxic'].unique()))
    validation_data = pd.DataFrame({'text': validation_data}).reset_index()
    print("Valid Shape:", validation_data.shape)

    tokenizer = AutoTokenizer.from_pretrained(CFG.base_model_name)

    train_dataset = TrainDataset(train_folds, tokenizer, CFG.max_length, is_train=True)
    valid_dataset = TrainDataset(validation_data, tokenizer, CFG.max_length, is_train=False)

    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True
        )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False
        )

    # initialize
    model = Model(CFG.base_model_name)
    model.to(device)
    criterion = get_criterion()

    optimizer = get_optimizer(model)
    # One scheduler step per batch; len(train_loader) already accounts for
    # drop_last=True, so the schedule ends exactly on the last optimizer step.
    num_train_steps = len(train_loader) * CFG.epochs
    # get_scheduler reads the warmup length from the config object.
    CFG.num_warmup_steps=num_train_steps*CFG.num_warmup_steps_ratio
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
    best_score = 0
    # Pre-bound so the return statement is valid even if no epoch runs.
    validation_less = validation_more = None
    best_less = best_more = None

    for epoch in range(CFG.start_epoch, CFG.start_epoch + CFG.epochs):
        start_time = time.time()
        # train
        avg_loss = train_fn(
            train_loader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
            )

        # eval
        preds = valid_fn(
            valid_loader,
            model,
            criterion,
            device
            )

        # Per-text score = sum of the per-class model outputs.
        validation_data[CFG.toxic_cols] = preds
        validation_data["pred"] = validation_data[CFG.toxic_cols].sum(axis=1)

        # Remove the previous epoch's scores before re-joining.
        validation = validation.drop(
            columns=['less_toxic_preds', 'more_toxic_preds'], errors='ignore'
            )
        class_preds = validation_data[["text"]+CFG.toxic_cols]
        validation_less = validation.merge(
            class_preds.rename(columns={"text": 'less_toxic'}),
            on='less_toxic',
            how='left'
            )
        validation_more = validation.merge(
            class_preds.rename(columns={"text": 'more_toxic'}),
            on='more_toxic',
            how='left'
            )

        text_preds = validation_data[["text", 'pred']]
        validation = validation.merge(
            text_preds.rename(columns={"text": 'less_toxic', 'pred': 'less_toxic_preds'}),
            on='less_toxic',
            how='left'
            )
        validation = validation.merge(
            text_preds.rename(columns={"text": 'more_toxic', 'pred': 'more_toxic_preds'}),
            on='more_toxic',
            how='left'
            )

        # scoring
        score = get_result(validation)

        elapsed = time.time() - start_time
        LOGGER.info(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  time: {elapsed:.0f}s")
        LOGGER.info(f"Epoch {epoch+1} - Score: {score:.4f}")

        if score > best_score:
            best_score = score
            LOGGER.info(f"Epoch {epoch+1} - Save : {score:.4f} Model")
            save_path = OUTPUT_DIR+f'{CFG.base_model_name}_best_score.pth'
            # A model name containing '/' puts the file in a sub-directory.
            save_dir = os.path.dirname(save_path)
            if save_dir:
                os.makedirs(save_dir, exist_ok=True)
            torch.save({"model": model.state_dict()}, save_path)
            # Positional copy: the merged frame has a fresh index that need
            # not match the caller's index.
            validation_last["more_toxic_preds"] = validation["more_toxic_preds"].values
            validation_last["less_toxic_preds"] = validation["less_toxic_preds"].values
            # Keep the per-class frames of the same epoch as the checkpoint.
            best_less, best_more = validation_less, validation_more
    if best_less is None:
        # No epoch beat the initial best score: fall back to the last epoch.
        best_less, best_more = validation_less, validation_more
    return validation_last, best_less, best_more


def main():
    """Seed, load the processed data, train once, and report/save results.

    With ``CFG.debug`` set, training is cut to one epoch on a 1000-row sample.
    When ``CFG.train`` is false nothing beyond loading happens.
    """
    seed_torch(seed=CFG.seed)
    train, validation_data, test, sub = read_processed_data()
    if CFG.debug:
        CFG.epochs = 1
        train = train.sample(n=1000, random_state=CFG.seed).reset_index(drop=True)
    oof_df = validation_data.copy()
    if not CFG.train:
        return

    # Accumulators for the pair scores (a single training run fills them).
    n_pairs = len(validation_data)
    oof_more_toxic = np.zeros(n_pairs)
    oof_less_toxic = np.zeros(n_pairs)

    run_df, validation_less, validation_more = train_loop(train, validation_data)
    oof_more_toxic += run_df["more_toxic_preds"].values
    oof_less_toxic += run_df["less_toxic_preds"].values
    get_result(run_df)

    # CV result
    LOGGER.info("========== CV ==========")
    oof_df["more_toxic_preds"] = oof_more_toxic
    oof_df["less_toxic_preds"] = oof_less_toxic
    get_result(oof_df)

    # save result
    for frame, filename in ((validation_less, "less_df.csv"), (validation_more, "more_df.csv")):
        frame.to_csv(OUTPUT_DIR+filename, index=False)

In [10]:
# Entry point: run the pipeline only when this file is executed directly
# (script or notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Sampling: Train
shape: (215920, 9)
                 id  ... target
0  0000997932d777bf  ...      0
1  000103f0d9cfb60f  ...      0
2  000113f07ec002fd  ...      0
3  0001b41b1c6bb37e  ...      0
4  0001d958c54c6e35  ...      0

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Under Sampling
Valid Shape: (14220, 2)
Epoch: [1][0/1208] Data 0.967 (0.967) Elapsed 0m 1s (remain 32m 42s) Loss: 0.8897(0.8897) LR:  0.000000
Epoch: [1][100/1208] Data 0.000 (0.010) Elapsed 1m 17s (remain 14m 11s) Loss: 0.2621(0.6701) LR:  0.000003
Epoch: [1][200/1208] Data 0.000 (0.005) Elapsed 2m 39s (remain 13m 16s) Loss: 0.1935(0.4518) LR:  0.000006
Epoch: [1][300/1208] Data 0.000 (0.003) Elapsed 4m 0s (remain 12m 4s) Loss: 0.0980(0.3673) LR:  0.000008
Epoch: [1][400/1208] Data 0.000 (0.003) Elapsed 5m 21s (remain 10m 47s) Loss: 0.1432(0.3196) LR:  0.000010
Epoch: [1][500/1208] Data 0.000 (0.002) Elapsed 6m 42s (remain 9m 28s) Loss: 0.1372(0.2882) LR:  0.000010
Epoch: [1][600/1208] Data 0.000 (0.002) Elapsed 8m 4s (remain 8m 8s) Loss: 0.1381(0.2667) LR:  0.000010
Epoch: [1][700/1208] Data 0.000 (0.002) Elapsed 9m 25s (remain 6m 48s) Loss: 0.1770(0.2500) LR:  0.000010
Epoch: [1][800/1208] Data 0.000 (0.001) Elapsed 10m 46s (remain 5m 28s) Loss: 0.0965(0.23

Score: 0.6974
Score: 0.6974
Epoch 1 - avg_train_loss: 0.2082  time: 1310s
Epoch 1 - avg_train_loss: 0.2082  time: 1310s
Epoch 1 - Score: 0.6974
Epoch 1 - Score: 0.6974
Epoch 1 - Save : 0.6974 Model
Epoch 1 - Save : 0.6974 Model


Epoch: [2][0/1208] Data 0.785 (0.785) Elapsed 0m 1s (remain 32m 11s) Loss: 0.2118(0.2118) LR:  0.000008
Epoch: [2][100/1208] Data 0.000 (0.008) Elapsed 1m 23s (remain 15m 12s) Loss: 0.1427(0.1321) LR:  0.000008
Epoch: [2][200/1208] Data 0.000 (0.004) Elapsed 2m 44s (remain 13m 43s) Loss: 0.1959(0.1271) LR:  0.000008
Epoch: [2][300/1208] Data 0.000 (0.003) Elapsed 4m 5s (remain 12m 19s) Loss: 0.1758(0.1304) LR:  0.000007
Epoch: [2][400/1208] Data 0.000 (0.002) Elapsed 5m 26s (remain 10m 57s) Loss: 0.1071(0.1293) LR:  0.000007
Epoch: [2][500/1208] Data 0.000 (0.002) Elapsed 6m 47s (remain 9m 35s) Loss: 0.0992(0.1303) LR:  0.000006
Epoch: [2][600/1208] Data 0.000 (0.001) Elapsed 8m 8s (remain 8m 13s) Loss: 0.1176(0.1298) LR:  0.000006
Epoch: [2][700/1208] Data 0.000 (0.001) Elapsed 9m 29s (remain 6m 51s) Loss: 0.1270(0.1305) LR:  0.000005
Epoch: [2][800/1208] Data 0.000 (0.001) Elapsed 10m 49s (remain 5m 30s) Loss: 0.0815(0.1314) LR:  0.000005
Epoch: [2][900/1208] Data 0.000 (0.001) Elaps

Score: 0.7026
Score: 0.7026
Epoch 2 - avg_train_loss: 0.1323  time: 1313s
Epoch 2 - avg_train_loss: 0.1323  time: 1313s
Epoch 2 - Score: 0.7026
Epoch 2 - Score: 0.7026
Epoch 2 - Save : 0.7026 Model
Epoch 2 - Save : 0.7026 Model


Epoch: [3][0/1208] Data 0.767 (0.767) Elapsed 0m 1s (remain 31m 54s) Loss: 0.0913(0.0913) LR:  0.000003
Epoch: [3][100/1208] Data 0.000 (0.008) Elapsed 1m 22s (remain 15m 7s) Loss: 0.1545(0.1179) LR:  0.000003
Epoch: [3][200/1208] Data 0.000 (0.004) Elapsed 2m 44s (remain 13m 42s) Loss: 0.1553(0.1176) LR:  0.000002
Epoch: [3][300/1208] Data 0.000 (0.003) Elapsed 4m 5s (remain 12m 19s) Loss: 0.0776(0.1154) LR:  0.000002
Epoch: [3][400/1208] Data 0.000 (0.002) Elapsed 5m 26s (remain 10m 56s) Loss: 0.1108(0.1171) LR:  0.000001
Epoch: [3][500/1208] Data 0.000 (0.002) Elapsed 6m 47s (remain 9m 34s) Loss: 0.1714(0.1178) LR:  0.000001
Epoch: [3][600/1208] Data 0.000 (0.001) Elapsed 8m 8s (remain 8m 13s) Loss: 0.1612(0.1167) LR:  0.000001
Epoch: [3][700/1208] Data 0.000 (0.001) Elapsed 9m 29s (remain 6m 51s) Loss: 0.0910(0.1174) LR:  0.000001
Epoch: [3][800/1208] Data 0.000 (0.001) Elapsed 10m 50s (remain 5m 30s) Loss: 0.2125(0.1174) LR:  0.000000
Epoch: [3][900/1208] Data 0.000 (0.001) Elapse

Score: 0.7033
Score: 0.7033
Epoch 3 - avg_train_loss: 0.1179  time: 1313s
Epoch 3 - avg_train_loss: 0.1179  time: 1313s
Epoch 3 - Score: 0.7033
Epoch 3 - Score: 0.7033
Epoch 3 - Save : 0.7033 Model
Epoch 3 - Save : 0.7033 Model
Score: 0.7033
Score: 0.7033
Score: 0.7033
Score: 0.7033


Sampling: Train
shape: (215920, 9)
                 id  ... target
0  0000997932d777bf  ...      0
1  000103f0d9cfb60f  ...      0
2  000113f07ec002fd  ...      0
3  0001b41b1c6bb37e  ...      0
4  0001d958c54c6e35  ...      0

[5 rows x 9 columns]
Text cleaning...


  0%|          | 0/215920 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

  0%|          | 0/30108 [00:00<?, ?it/s]

Train Shape: (215920, 10)
Under Sampling
Valid Shape: (14220, 2)
Epoch: [1][0/1208] Data 0.990 (0.990) Elapsed 0m 2s (remain 40m 14s) Loss: 0.8019(0.8019) LR:  0.000000
Epoch: [1][100/1208] Data 0.000 (0.010) Elapsed 1m 23s (remain 15m 15s) Loss: 0.3738(0.5853) LR:  0.000003


KeyboardInterrupt: ignored