In [1]:
import os
import random
import time

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

import transformers

from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F

from torch import Tensor
from torch.optim import *
from torch.nn.modules.loss import *
from torch.optim.lr_scheduler import * 
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import RandomSampler


In [2]:
from transformers import AutoModel, AutoTokenizer

In [3]:
from transformers import XLMRobertaModel, XLMRobertaTokenizer

In [4]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [5]:
seed = 2020
seed_everything(seed)

# Load data

In [6]:
train1 = pd.read_csv("/home/maciej/Workspace/toxic/data/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/home/maciej/Workspace/toxic/data/jigsaw-unintended-bias-train.csv")
train2.toxic = train2.toxic.round().astype(int)

valid = pd.read_csv('/home/maciej/Workspace/toxic/data/validation.csv')
test = pd.read_csv('/home/maciej/Workspace/toxic/data/test.csv')
sub = pd.read_csv('/home/maciej/Workspace/toxic/data/sample_submission.csv')

In [7]:
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0')#.sample(n=100000, random_state=0)
])

# Encode

In [8]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [9]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [10]:
EPOCHS = 2
BATCH_SIZE = 16
MAX_LEN = 192
MODEL = 'xlm-roberta-base'

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [12]:
%%time 

x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(test.content.values, tokenizer, maxlen=MAX_LEN)

y_train = train.toxic.values
y_valid = valid.toxic.values

CPU times: user 8min 57s, sys: 3.51 s, total: 9min 1s
Wall time: 9min 1s


In [13]:
x_train.shape

(2125743, 192)

In [14]:
y_train.shape

(2125743,)

# Datasets

In [15]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Base class that defines common API for datasets."""

    def __init__(self, x, y=None):
        self.x = x
        self.y = y

    def __getitem__(self, index: int):
        if self.y is not None:
            return self.x[index], self.y[index]
        return self.x[index]

    def __len__(self) -> int:
        return len(self.x)

In [16]:
train_dataset = Dataset(x_train, y_train)

In [17]:
valid_dataset = Dataset(x_valid, y_valid)

In [18]:
test_dataset = Dataset(x_test)

# Transformer

In [19]:
class Transformer(nn.Module):
    def __init__(self, model, num_classes=1):
        """
        Constructor
        
        Arguments:
            model {string} -- Transformer to build the model on. Expects "camembert-base".
            num_classes {int} -- Number of classes (default: {1})
        """
        super().__init__()
        
        self.transformer = model

        self.nb_features = self.transformer.pooler.dense.out_features

        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features), 
            nn.Tanh(),
        )

        self.logit = nn.Linear(self.nb_features, num_classes)

    def forward(self, tokens):
        """
        Usual torch forward function
        
        Arguments:
            tokens {torch tensor} -- Sentence tokens
        
        Returns:
            torch tensor -- Class logits
        """
        hidden_states, _ = self.transformer(
            tokens, attention_mask=(tokens > 0).long()
        )

        hidden_states = hidden_states[:, 0] # Use the representation of the first token of the last layer

        ft = self.pooler(hidden_states)

        return self.logit(ft)

In [20]:
def fit(model, train_dataset, val_dataset, epochs=1, batch_size=32, warmup_prop=0, lr=5e-5):
    device = 'cuda' #xm.xla_device()
    model.to(device)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    
    num_warmup_steps = int(warmup_prop * epochs * len(train_loader))
    num_training_steps = epochs * len(train_loader)
    
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

    loss_fct = nn.BCEWithLogitsLoss(reduction='mean').to(device)
    
    for epoch in range(epochs):
        model.train()
        start_time = time.time()
        
        optimizer.zero_grad()
        avg_loss = 0
        
        for step, (x, y_batch) in tqdm(enumerate(train_loader), total=len(train_loader)): 
            y_pred = model(x.to(device))
            
            loss = loss_fct(y_pred.view(-1).float(), y_batch.float().to(device))
            loss.backward()
            avg_loss += loss.item() / len(train_loader)

            optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()
            print('{step}/{total}'.format(step=step, total=len(train_loader)))
                
        model.eval()
        preds = []
        truths = []
        avg_val_loss = 0.

        with torch.no_grad():
            for x, y_batch in val_loader:                
                y_pred = model(x.to(device))
                loss = loss_fct(y_pred.detach().view(-1).float(), y_batch.float().to(device))
                avg_val_loss += loss.item() / len(val_loader)
                
                probs = torch.sigmoid(y_pred).detach().cpu().numpy()
                preds += list(probs.flatten())
                truths += list(y_batch.numpy().flatten())
            score = roc_auc_score(truths, preds)
            
        
        dt = time.time() - start_time
        lr = scheduler.get_last_lr()[0]
        print('Epoch {epoch}/{epochs} \t lr={lr} \t t={dt}s \t loss={avg_loss} \t val_loss={avg_val_loss} \t val_auc={score}'.format(
            epoch=epoch,
            epochs=epochs,
            lr=lr,
            dt=dt,
            avg_loss=avg_loss,
            avg_val_loss=avg_val_loss,
            score=score
        ))
        #print(f'Epoch {epoch + 1}/{epochs} \t lr={lr:.1e} \t t={dt:.0f}s \t loss={avg_loss:.4f} \t val_loss={avg_val_loss:.4f} \t val_auc={score:.4f}')

In [21]:
epochs = 1 # 1 epoch seems to be enough
batch_size = 16
warmup_prop = 0.1
lr = 2e-5  # Important parameter to tweak

In [22]:
#transformer = AutoModel.from_pretrained(MODEL)

In [23]:
#classifier = Transformer(transformer)

In [24]:
#fit(classifier, train_dataset, valid_dataset, epochs=epochs, batch_size=batch_size, warmup_prop=warmup_prop, lr=lr)

In [25]:
#classifier.transformer.save_pretrained('model/first_model')

In [26]:
def predict(model, dataset, batch_size=16):
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    preds = []
    truths = []
    avg_val_loss = 0.

    with torch.no_grad():
        for _, x in tqdm(enumerate(loader), total=len(loader)):                
            y_pred = model(x.to('cuda'))
            probs = torch.sigmoid(y_pred).detach().cpu().numpy()
            preds += list(probs.flatten())
    return preds

In [27]:
transformer = AutoModel.from_pretrained('./model/first_model')

In [28]:
classifier = Transformer(transformer)

In [29]:
classifier.to('cuda')

Transformer(
  (transformer): XLMRobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=

In [30]:
test

Unnamed: 0,id,content,lang
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,tr
1,1,"Вполне возможно, но я пока не вижу необходимо...",ru
2,2,"Quindi tu sei uno di quelli conservativi , ...",it
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,tr
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,tr
5,5,Le truc le plus important dans ta tirade c est...,fr
6,6,"20px Caro editor, encontramos problemas na edi...",pt
7,7,el skate es unos de los deportes favoritos de ...,es
8,8,Me doy la bienvenida. A este usuari le gusta c...,es
9,9,"ES NOTABLEMENTE TENDENCIOSO, NO SE HABLA DE CU...",es


In [31]:
preds = predict(classifier, test_dataset)

HBox(children=(FloatProgress(value=0.0, max=3989.0), HTML(value='')))




In [32]:
classifier(test_dataset[0])

AttributeError: 'numpy.ndarray' object has no attribute 'long'

In [33]:
len(test)

63812

In [41]:
sub['toxic'] = preds

In [43]:
sub.to_csv('preds.csv')

In [36]:
preds

[0.041890193,
 0.04364335,
 0.061732296,
 0.036378905,
 0.037885625,
 0.046357226,
 0.04339461,
 0.037015777,
 0.35728034,
 0.17888205,
 0.036505077,
 0.20173225,
 0.82753825,
 0.049571786,
 0.29557827,
 0.063967206,
 0.038357083,
 0.03453088,
 0.03396784,
 0.07432015,
 0.046146777,
 0.03714179,
 0.043859903,
 0.29813716,
 0.036270916,
 0.14713489,
 0.04335147,
 0.036112223,
 0.29121426,
 0.041792236,
 0.123881586,
 0.037975606,
 0.066351995,
 0.040454026,
 0.040251948,
 0.043759555,
 0.044165492,
 0.03577595,
 0.13794467,
 0.039145913,
 0.037136607,
 0.036153287,
 0.14240599,
 0.05101105,
 0.041368492,
 0.037966937,
 0.050588634,
 0.043965537,
 0.046192583,
 0.07301474,
 0.31763434,
 0.04152216,
 0.36518833,
 0.035191048,
 0.041350797,
 0.0458616,
 0.0439705,
 0.744836,
 0.03986522,
 0.06365075,
 0.25605536,
 0.04288899,
 0.08948533,
 0.049614873,
 0.31162903,
 0.041092165,
 0.0438178,
 0.04448736,
 0.04199287,
 0.09095582,
 0.21469893,
 0.03612182,
 0.30643794,
 0.037645206,
 0.03953