In [None]:
import os
import random
import time
import shutil

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

import transformers

from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F

from torch import Tensor
from torch.optim import *
from torch.nn.modules.loss import *
from torch.optim.lr_scheduler import * 
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import RandomSampler


In [None]:
from transformers import AutoModel, AutoTokenizer

In [None]:
from transformers import XLMRobertaModel, XLMRobertaTokenizer

In [None]:
def seed_everything(seed):
    """
    Seeds every random number generator used here, for reproducibility of results

    Arguments:
        seed {int} -- Value applied to all generators
    """
    # Export the seed as PYTHONHASHSEED.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python, NumPy, then PyTorch (global generator and CUDA generator).
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

In [None]:
# Single seed for the notebook, applied here before any data or model cell runs.
seed = 2020
seed_everything(seed)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    Encode `texts` in a single `batch_encode_plus` call and return the token ids.

    Arguments:
        texts -- Collection of raw strings handed to the tokenizer
        tokenizer -- Object exposing `batch_encode_plus`
        maxlen {int} -- Forwarded as `max_length` (default: {512})

    Returns:
        np.ndarray -- Array built from the `input_ids` entry of the encoding
    """
    # Only the ids are used downstream, so masks and token type ids are switched off.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoded['input_ids']
    return np.array(token_ids)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Index-aligned wrapper around features `x` and optional targets `y`."""

    def __init__(self, x, y=None):
        self.x, self.y = x, y

    def __len__(self) -> int:
        # The length is driven by the features alone.
        return len(self.x)

    def __getitem__(self, index: int):
        sample = self.x[index]
        if self.y is None:
            # Unlabelled data: only the features are returned.
            return sample
        return sample, self.y[index]

In [None]:
def get_data(tokenizer, nrows, max_len, data_cache_dir, overwrite):
    """
    Build the train / validation / test datasets, using an on-disk cache of encoded arrays.

    Arguments:
        tokenizer -- Tokenizer forwarded to `regular_encode`
        nrows {int or None} -- Row limit passed to every `pd.read_csv` call; with None the
            non-toxic rows of the second training file are down-sampled to 100000
        max_len {int} -- Passed to `regular_encode` as `maxlen`
        data_cache_dir {str} -- Directory holding the cached `.npy` arrays
        overwrite {bool} -- When true, an existing cache directory is deleted and rebuilt

    Returns:
        tuple -- (train_dataset, valid_dataset, test_dataset); the test dataset carries no labels
    """
    # np.save appends ".npy" to a file name that lacks it, so the suffix is spelled out
    # once here and the very same paths are used for both saving and loading.
    array_names = ("x_train", "x_valid", "x_test", "y_train", "y_valid")
    cache_paths = {name: os.path.join(data_cache_dir, name + ".npy") for name in array_names}

    if overwrite and os.path.exists(data_cache_dir):
        shutil.rmtree(data_cache_dir)

    if all(os.path.exists(path) for path in cache_paths.values()):
        arrays = {name: np.load(path) for name, path in cache_paths.items()}
    else:
        # Missing or incomplete cache: rebuild everything from the CSV files.
        input_dir = "../input/jigsaw-multilingual-toxic-comment-classification"
        train1 = pd.read_csv(input_dir + "/jigsaw-toxic-comment-train.csv", nrows=nrows)
        train2 = pd.read_csv(input_dir + "/jigsaw-unintended-bias-train.csv", nrows=nrows)
        # The second file stores toxicity as a fraction; binarise it to match the first.
        train2["toxic"] = train2["toxic"].round().astype(int)

        valid = pd.read_csv(input_dir + "/validation.csv", nrows=nrows)
        test = pd.read_csv(input_dir + "/test.csv", nrows=nrows)

        columns = ['comment_text', 'toxic']
        bias_rows = train2[columns]
        bias_negatives = bias_rows.query('toxic==0')
        if nrows is None:
            # Full run: cap the non-toxic rows of the second file.
            bias_negatives = bias_negatives.sample(n=100000, random_state=0)
        train = pd.concat([train1[columns], bias_rows.query('toxic==1'), bias_negatives])

        arrays = {
            "x_train": regular_encode(train.comment_text.values, tokenizer, maxlen=max_len),
            "x_valid": regular_encode(valid.comment_text.values, tokenizer, maxlen=max_len),
            "x_test": regular_encode(test.content.values, tokenizer, maxlen=max_len),
            "y_train": train.toxic.values,
            "y_valid": valid.toxic.values,
        }

        os.makedirs(data_cache_dir, exist_ok=True)
        for name, path in cache_paths.items():
            np.save(path, arrays[name])

    train_dataset = Dataset(arrays["x_train"], arrays["y_train"])
    valid_dataset = Dataset(arrays["x_valid"], arrays["y_valid"])
    test_dataset = Dataset(arrays["x_test"])

    return train_dataset, valid_dataset, test_dataset

# Transformer

In [None]:
class Classifier(nn.Module):
    """Transformer encoder followed by a tanh pooling layer and a linear classification head."""

    def __init__(self, transformer, num_classes=1, pad_token_id=None):
        """
        Constructor

        Arguments:
            transformer {nn.Module} -- Encoder instance to build the model on; it must expose
                `pooler.dense.out_features`
            num_classes {int} -- Number of classes (default: {1})
            pad_token_id {int or None} -- Id of the padding token used to build the attention
                mask. When None it is read from `transformer.config.pad_token_id`, and 0 is
                used if that is unavailable (default: {None})
        """
        super().__init__()

        self.transformer = transformer

        self.nb_features = self.transformer.pooler.dense.out_features

        if pad_token_id is None:
            config = getattr(transformer, "config", None)
            pad_token_id = getattr(config, "pad_token_id", None)
        # The padding id is not 0 for every vocabulary, so it is taken from the
        # encoder itself; 0 only remains as the last-resort fallback.
        self.pad_token_id = 0 if pad_token_id is None else pad_token_id

        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features),
            nn.Tanh(),
        )

        self.logit = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens

        Returns:
            torch tensor -- Class logits
        """
        # Attend to every position that is not padding.
        attention_mask = (tokens != self.pad_token_id).long()
        outputs = self.transformer(tokens, attention_mask=attention_mask)

        # Index instead of tuple-unpacking so the number of values returned by the
        # encoder does not matter; the first one is the last layer's hidden states.
        hidden_states = outputs[0]

        first_token_state = hidden_states[:, 0]  # representation of the first token

        ft = self.pooler(first_token_state)

        return self.logit(ft)

In [None]:
def fit(model, train_dataset, val_dataset, epochs=1, batch_size=32, warmup_prop=0, lr=5e-5, device=None):
    """
    Train `model` with AdamW and a linear warm-up schedule, validating after each epoch.

    Arguments:
        model {nn.Module} -- Model returning one logit per sample
        train_dataset -- Dataset yielding (tokens, label) pairs used for training
        val_dataset -- Dataset yielding (tokens, label) pairs used for validation
        epochs {int} -- Number of passes over the training set (default: {1})
        batch_size {int} -- Batch size of both loaders (default: {32})
        warmup_prop {float} -- Proportion of the training steps used for warm-up (default: {0})
        lr {float} -- Initial learning rate (default: {5e-5})
        device -- Device to train on; when None, 'cuda' is used if available and
            'cpu' otherwise (default: {None})
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = epochs * len(train_loader)

    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    loss_fct = nn.BCEWithLogitsLoss(reduction='mean').to(device)

    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        avg_loss = 0

        # The tqdm bar already reports progress, so no per-step print is needed.
        for x, y_batch in tqdm(train_loader, total=len(train_loader)):
            optimizer.zero_grad()

            y_pred = model(x.to(device))
            loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            scheduler.step()

        model.eval()
        preds = []
        truths = []
        avg_val_loss = 0.

        with torch.no_grad():
            for x, y_batch in val_loader:
                y_pred = model(x.to(device))
                loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
                avg_val_loss += loss.item() / len(val_loader)

                probs = torch.sigmoid(y_pred).cpu().numpy()
                preds.extend(probs.flatten())
                truths.extend(y_batch.numpy().flatten())

        try:
            score = roc_auc_score(truths, preds)
        except ValueError:
            # AUC is undefined when the validation labels hold a single class
            # (easy to hit with a small row limit); report NaN rather than abort.
            score = float('nan')

        dt = time.time() - start_time
        # Separate name so the `lr` argument is not shadowed.
        current_lr = scheduler.get_last_lr()[0]
        print('Epoch {epoch}/{epochs} \t lr={lr} \t t={dt}s \t loss={avg_loss} \t val_loss={avg_val_loss} \t val_auc={score}'.format(
            epoch=epoch + 1,  # shown one-based so the last epoch reads "N/N"
            epochs=epochs,
            lr=current_lr,
            dt=dt,
            avg_loss=avg_loss,
            avg_val_loss=avg_val_loss,
            score=score
        ))

In [None]:
def get_model(
        train_dataset, 
        val_dataset, 
        model_cache_dir, 
        overwrite, 
        transformer_type, 
        epochs=1, 
        batch_size=32, 
        warmup_prop=0, 
        lr=5e-5
    ):
    """
    Return a `Classifier`, either restored from cached weights or freshly trained.

    Arguments:
        train_dataset -- Training dataset forwarded to `fit`
        val_dataset -- Validation dataset forwarded to `fit`
        model_cache_dir {str} -- Directory in which the weights file is stored
        overwrite {bool} -- When true, an existing cache directory is deleted and the model retrained
        transformer_type {str} -- Name passed to `AutoModel.from_pretrained`
        epochs {int} -- Forwarded to `fit` (default: {1})
        batch_size {int} -- Forwarded to `fit` (default: {32})
        warmup_prop {float} -- Forwarded to `fit` (default: {0})
        lr {float} -- Forwarded to `fit` (default: {5e-5})

    Returns:
        Classifier -- The trained (or restored) model
    """
    classifier = Classifier(AutoModel.from_pretrained(transformer_type))
    weights_path = os.path.join(model_cache_dir, "classifier.pt")

    # Only the model cache is cleared here; the data cache is managed by get_data.
    if overwrite and os.path.exists(model_cache_dir):
        shutil.rmtree(model_cache_dir)

    if os.path.exists(weights_path):
        # load_state_dict expects the state dict itself, so the file is read first.
        # NOTE: torch.load unpickles the file; only point this at a cache you wrote yourself.
        state_dict = torch.load(weights_path, map_location='cpu')
        classifier.load_state_dict(state_dict)
        # fit() moves a freshly trained model to the GPU; do the same for a restored one.
        classifier.to('cuda' if torch.cuda.is_available() else 'cpu')
    else:
        fit(
            classifier,
            train_dataset,
            val_dataset,
            epochs=epochs,
            batch_size=batch_size,
            warmup_prop=warmup_prop,
            lr=lr
        )
        os.makedirs(model_cache_dir, exist_ok=True)
        torch.save(classifier.state_dict(), weights_path)
    return classifier

In [None]:
def predict(model, dataset, batch_size=16):
    """
    Run `model` over `dataset` and write the probabilities to 'submission.csv'.

    Arguments:
        model {nn.Module} -- Model returning one logit per sample
        dataset -- Unlabelled dataset yielding token tensors
        batch_size {int} -- Batch size of the loader (default: {16})

    Returns:
        list -- Sigmoid probabilities, in dataset order
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()

    # Send the inputs to wherever the model already lives instead of assuming a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    preds = []
    with torch.no_grad():
        for x in tqdm(loader, total=len(loader)):
            y_pred = model(x.to(device))
            probs = torch.sigmoid(y_pred).cpu().numpy()
            preds.extend(probs.flatten())

    # Read exactly as many submission rows as there are predictions, so the
    # function does not depend on a notebook-level `nrows` variable.
    sub = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv', nrows=len(preds))
    sub['toxic'] = preds
    sub.to_csv('submission.csv', index=False)
    return preds

# Script

In [None]:
# --- Model / tokenizer ---
MODEL = 'xlm-roberta-base'      # checkpoint name given to AutoTokenizer / AutoModel
max_len = 192                   # passed to get_data as the encoding length
device = 'cuda'

# --- Data ---
nrows = 10                      # row limit for pd.read_csv; None reads the full files
data_cache_dir = 'data_cache'
data_overwrite = True           # passed to get_data as `overwrite`

# --- Training ---
epochs = 1
batch_size = 16
lr = 2e-5
warmup_prop = 0                 # share of training steps used for warm-up
model_cache_dir = 'model_cache'
model_overwrite = True          # passed to get_model as `overwrite`

In [None]:
# Load the tokenizer for the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Build the three datasets; get_data handles the on-disk cache according to data_overwrite.
train_dataset, valid_dataset, test_dataset = get_data(tokenizer, nrows, max_len, data_cache_dir, data_overwrite)

In [None]:
# Obtain the classifier through get_model, forwarding the hyper-parameters
# defined in the configuration cell.
classifier = get_model(
    train_dataset, 
    valid_dataset, 
    model_cache_dir, 
    model_overwrite,
    MODEL, 
    epochs=epochs, 
    batch_size=batch_size,
    warmup_prop=warmup_prop, 
    lr=lr
)

In [None]:
# `classifier` already holds the model returned by get_model(...). Re-assigning it
# here would replace that model with untrained weights, so the fresh copy is
# bound to its own name and kept for inspection only.
untrained_classifier = Classifier(AutoModel.from_pretrained(MODEL))

In [None]:
# NOTE(review): despite its name, `model` is bound to a tokenizer here
# (AutoTokenizer, not AutoModel) — confirm whether AutoModel was intended.
model = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Display the object currently bound to `model`.
model

In [None]:
# Display the repr of `classifier`.
classifier

In [None]:
# Predict on the test split with the classifier; `test_dataset` comes from the
# get_data(...) cell (there is no variable named `dataset` in this notebook).
predict(classifier, test_dataset, batch_size=16)