In [37]:
import os
import pandas as pd
import numpy as np 
from tqdm import tqdm
from functools import reduce
from copy import deepcopy
import matplotlib.pyplot as plt

import re
import torch
from nltk import tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          get_linear_schedule_with_warmup,
                          logging)
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

os.environ["TOKENIZERS_PARALLELISM"]="true"
logging.set_verbosity_error()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
TARGET_COLUMNS = ["toxic",
                  "severe_toxic",
                  "obscene",
                  "threat",
                  "insult",
                  "identity_hate"]

In [39]:
def remove_special_chars(text):
    if isinstance(text, str):
        # Remove special characters
        text = re.sub(r'[^\w\s]','', text) 
        # Remove non-ASCII characters (such as emojis)
        text = re.sub(r'[^\x00-\x7F]+', '', text) 
        text = " ".join(tokenize.sent_tokenize(text))
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"\n+", ". ", text)
        for symb in ["!", ",", ":", ";", "?"]:
            text = re.sub(rf"\{symb}\.", symb, text)
        text = re.sub("[^а-яА-Яa-zA-Z0-9!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ё]+", " ", text)
        text = re.sub(r"#\S+", "", text)
        text = text.strip()
    return text

def remove_sequential_dots(text):
    # Collapse sequential dots
    text = re.sub("\.+", ".", text)
    # Collapse dots separated by whitespaces
    all_collapsed = False
    while not all_collapsed:
        output = re.sub(r"\.(( )*)\.", ".", text)
        all_collapsed = text == output
        text = output
    return output

def remove_stop_words(text, stop_words):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

def preprocess_text(text, stop_words=stopwords.words('english')):
    text = remove_special_chars(text)
    text = remove_sequential_dots(text)
    text = remove_stop_words(text, stop_words)
    return text

In [40]:
def preprocess(train,
               test,
               test_labels,
               max_text_length = 512,
               sample=1):
    test = test.merge(test_labels, on="id")
    
    train = train.sample(int(train.shape[0]*sample), random_state=69)
    test = test.sample(int(test.shape[0]*sample), random_state=69)
    
    #closed_test = test[(test[TARGET_COLUMNS] == -1).any(axis=1)].reset_index(drop=True)
    test = test[~(test[TARGET_COLUMNS] == -1).any(axis=1)].reset_index(drop=True)
    
    train['clean_comment_text'] = train['comment_text'].apply(preprocess_text)
    test['clean_comment_text'] = test['comment_text'].apply(preprocess_text)
    
    train['clean_comment_text'] = train['clean_comment_text'].str.slice(0, max_text_length)
    test['clean_comment_text'] = test['clean_comment_text'].str.slice(0, max_text_length)
    
    return train, test

In [41]:
def comp_metric(y_true, y_pred, verbose=1):
    assert y_true.shape == y_pred.shape
    class_roc_aucs = [roc_auc_score(y_true[:,i], y_pred[:,i]) for i in range(y_pred.shape[1])]
    if verbose:
        for ra, tgt_col in zip(class_roc_aucs, TARGET_COLUMNS):
            if verbose > 1:
                print(f"{tgt_col} Roc Auc: {ra}")
        print(f"Result Roc Auc: {np.mean(class_roc_aucs)}")
    return class_roc_aucs, np.mean(class_roc_aucs)

def print_losses(input, verbose=1):
    if verbose:
        for cls_idx, cls_name in enumerate(TARGET_COLUMNS):
            if verbose > 1:
                print(f"{cls_name} BCE loss: {input[:,cls_idx].mean()}")
        print(f"Result BCE loss: {input.mean()}")

In [42]:
class TextDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        texts,
        targets,
        dataset_tokenizer,
        max_length,
        trim_policy="random"
    ):
        self.targets = targets
        self.tokenized_texts =  dataset_tokenizer(texts)
        self.tokenizer = dataset_tokenizer
        
        self.max_length = max_length
        if trim_policy not in ["random", "first"]:
            raise ValueError(f"{trim_policy} is not valid trim_policy")
        self.trim_policy = trim_policy
        
    def select_text_subsequance(self, input):
        input_len = len(input["input_ids"])
        if input_len < self.max_length:
            pad_len = self.max_length - input_len
            return {
                "input_ids": input["input_ids"] + [self.tokenizer.pad_token_id] * pad_len,
                "attention_mask": input["attention_mask"] + [0] * pad_len
            } 
        elif input_len > self.max_length:
            if self.trim_policy == "random":
                start = np.random.randint(0, input_len - self.max_length)
            elif self.trim_policy == "first":
                start = 0
            return {
                "input_ids": input["input_ids"][start : start + self.max_length - 1] + [self.tokenizer.sep_token_id] ,
                "attention_mask": input["attention_mask"][start : start + self.max_length]
            }
        else: 
            return input
        
    def __getitem__(self, idx):
        tokenized = {k:v[idx] for k,v in self.tokenized_texts.items()}
        tokenized = self.select_text_subsequance(tokenized)
        tokenized = {k:torch.LongTensor(v) for k,v in tokenized.items()}
        tokenized["target"] = torch.from_numpy(self.targets[idx]).float()
        return tokenized
    
    def __len__(self):
        return len(self.targets)

In [43]:
class Model:
    def __init__(self, train_dataset, test_dataset):
        # Datasets
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        
        # Model
        self.model = {}
        self.is_built = False
        self.is_trained = False
        self.n_epochs = None
        self.train_torch_dataloader = {}
        self.valid_torch_dataloader = {}
        self.test_torch_dataloader = {}
        self.tokenizer = {}
        self.optimizer = {}
        self.criterion = {}
        self.scheduler = {}
        self.batch_size = None
        self.max_length = None
        self.gradient_clipping = None
        self.lr_update = None
        self.num_workers = None
        self.device = ""
        
        # Train results
        self.train_all_epoch_labels = []
        self.train_all_epoch_losses = []
        self.train_all_epoch_targets = []
        self.valid_all_epoch_labels = []
        self.valid_all_epoch_losses = []
        self.valid_all_epoch_targets = []
        self.valid_roc_aucs = []
        self.train_roc_aucs = []
        self.best_metric = - np.inf
        self.best_model_state_dict = None
        
    # Prepare evething and initialize the model
    def build(self,
              tokenizer_name='distilbert-base-uncased',
              model_name='distilbert-base-uncased',
              max_length=256,
              batch_size=32,
              FOLDS=5,
              n_epochs=5,
              gradient_clipping=False,
              amsgrad=True,
              model_lr=1e-5,
              classifier_lr=1e-3,
              lr_update=False,
              weight_decay=0,
              num_workers=2,
              device="cuda"):
        
        self.is_built = True
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.max_length = max_length
        self.lr_update = lr_update
        self.gradient_clipping = gradient_clipping
        self.num_workers = num_workers
        self.device = device
        
        # Create stritified target from target columns
        train["stratified_target"] = train[TARGET_COLUMNS].apply(
            lambda x: reduce(lambda x, y: str(x) + str(y), x), axis=1)
        small_groups = train["stratified_target"].value_counts()[
            train["stratified_target"].value_counts() < FOLDS].index
        train.loc[train["stratified_target"].isin(small_groups), "stratified_target"] = "-1"
        
        # Cross-validation
        stratifier = StratifiedKFold(n_splits=FOLDS,
                                     random_state=69,
                                     shuffle=True)
        folds_ids = [el for el in stratifier.split(train,
                                                   train["stratified_target"])]
        
        # Tokenize text and create dataloader
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        train_torch_dataset = TextDataset(
            texts=train.iloc[folds_ids[0][0]]["clean_comment_text"].to_list(),
            targets=train.iloc[folds_ids[0][0]][TARGET_COLUMNS].values,
            dataset_tokenizer=self.tokenizer,
            max_length=self.max_length,
        )
        self.train_torch_dataloader = torch.utils.data.DataLoader(
            train_torch_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            drop_last=True,
            num_workers=self.num_workers,
            pin_memory=True
        )
        
        valid_torch_dataset = TextDataset(
            texts=train.iloc[folds_ids[0][1]]["clean_comment_text"].to_list(),
            targets=train.iloc[folds_ids[0][1]][TARGET_COLUMNS].values,
            dataset_tokenizer=self.tokenizer,
            max_length=self.max_length,
            trim_policy="first"
        )
        self.valid_torch_dataloader = torch.utils.data.DataLoader(
            valid_torch_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
        
        # Initialize model
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(TARGET_COLUMNS), 
            ignore_mismatched_sizes=True,
            max_length=self.max_length
        ).to(device)
        
        # Hyperparameters
        self.criterion = torch.nn.BCEWithLogitsLoss(reduction="none")
        self.optimizer = torch.optim.AdamW([
            {'params': self.model.distilbert.parameters(), "lr": model_lr},
            {'params': self.model.classifier.parameters(), "lr": classifier_lr},],
            weight_decay=weight_decay,
            amsgrad=amsgrad
        )
        self.scheduler = get_linear_schedule_with_warmup(
            optimizer=self.optimizer,
            num_warmup_steps=int(0.05*len(self.train_torch_dataloader) * self.n_epochs),
            num_training_steps=len(self.train_torch_dataloader) * self.n_epochs
        )
        return self
    
    def train(self, verbose=1):
        if not self.is_built:
            raise Exception('The model was not build and cannot be trained!')
        
        self.is_trained = True
        for epoch in range(self.n_epochs):
            # 1.1 Iterate over all train dataset and update model weights
            if verbose:
                print(f"Starting Epoch {epoch + 1}")
                print("Train phase")
            (train_epoch_labels,
            train_epoch_losses,
            train_epoch_targets) = self.torch_loop( 
                mode="train"
            )
            # 1.2 Compute and print train metrics
            if verbose:
                print("Train metrics")
            _, train_roc_auc = comp_metric(
                train_epoch_targets, 
                train_epoch_labels,
                verbose=verbose
            )
            if verbose:
                print("Train BCE losses")
            print_losses(train_epoch_losses)
            # 2.1 Iterate over all valid dataset and compute predictions
            if verbose:
                print("Valid phase")
            (valid_epoch_labels,
            valid_epoch_losses,
            valid_epoch_targets) = self.torch_loop(
                validation=True, 
                mode="eval"
            )
            # 2.2 Compute and print valid metrics
            if verbose:
                print("Valid metrics")
            _, valid_roc_auc = comp_metric(
                valid_epoch_targets, 
                valid_epoch_labels,
                verbose=verbose
            )
            if verbose:
                print("Valid BCE losses")
            print_losses(valid_epoch_losses, verbose=verbose)
            # 3. Update learning rate
            if self.lr_update:
                self.scheduler.step(valid_roc_auc)
            # 4. Save best model
            if valid_roc_auc > self.best_metric:
                self.best_metric = valid_roc_auc
                self.best_model_state_dict = deepcopy(self.model.state_dict())
            # 5. Accumulate all stats  
            self.train_all_epoch_labels.append(train_epoch_labels)
            self.train_all_epoch_losses.append(train_epoch_losses)
            self.train_all_epoch_targets.append(train_epoch_targets)
            self.valid_all_epoch_labels.append(valid_epoch_labels)
            self.valid_all_epoch_losses.append(valid_epoch_losses)
            self.valid_all_epoch_targets.append(valid_epoch_targets)
            self.valid_roc_aucs.append(valid_roc_auc)
            self.train_roc_aucs.append(train_roc_auc)
        return self
    
    def torch_loop(self,
                   validation=False,
                   mode="train"):
        if mode == "train":
            self.model.train()
            dataloader = self.train_torch_dataloader
        else:
            self.model.eval()
            if validation:
                dataloader = self.valid_torch_dataloader
            else:
                dataloader = self.test_torch_dataloader
    
        all_predicted_label = []
        all_losses = []
        all_targets = []
        with torch.inference_mode(mode=(mode != "train")):
            for text in tqdm(dataloader):
                text = {k:v.to(self.device) for k,v in text.items()}
                label = text.pop("target")
                if mode == "train":
                    self.optimizer.zero_grad()
                predicted_label = self.model(**text).logits
                loss = self.criterion(predicted_label, label)
                if mode == "train":
                    loss.mean().backward()
                    if self.gradient_clipping:
                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.1)
                    self.optimizer.step()
                    if self.scheduler is not None:
                        self.scheduler.step()

                all_predicted_label.append(torch.sigmoid(predicted_label.detach()).cpu().numpy())
                all_losses.append(loss.detach().cpu().numpy())
                all_targets.append(label.detach().cpu().numpy())
        all_predicted_label = np.concatenate(all_predicted_label)
        all_losses = np.concatenate(all_losses)
        all_targets = np.concatenate(all_targets)

        return all_predicted_label, all_losses, all_targets
    
    def evaluate(self, verbose=1):
        if not self.is_trained:
            raise Exception('The model was not trained and cannot be evaluated!')
            return self
        
        # Load best model
        self.model.load_state_dict(self.best_model_state_dict)
        test_torch_dataset = TextDataset(
            texts=test["clean_comment_text"].to_list(),
            targets=test[TARGET_COLUMNS].values,
            dataset_tokenizer=self.tokenizer,
            max_length=self.max_length,
            trim_policy="first"
        )
        self.test_torch_dataloader = torch.utils.data.DataLoader(
            test_torch_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            drop_last=False,
            num_workers=self.num_workers,
            pin_memory=True
        )
        test_labels, test_losses, test_targets = self.torch_loop( 
            mode="eval"
        )
        print("Test metrics")
        comp_metric(
            test_targets, 
            test_labels,
            verbose=verbose
        )
        
        print("Test BCE losses")
        print_losses(test_losses, verbose=verbose)
        
        return comp_metric(test_targets,
                           test_labels,
                           verbose=verbose)

In [44]:
train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
test = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test.csv")
test_labels = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv")

parameters = {'Sample':[1],'max_length':[512],
              'n_epochs':[5],'batch_size':[32],
              'gradient_clipping':[False],
              'lr_update':[False],'model_lr':[1e-5],
              'classifier_lr':[1e-3], 'weight_decay':[0],
              'folds':[5]}

if os.path.exists('results.csv'):
    results = pd.read_csv("results.csv")
else:
    columns = list(parameters.keys()).append('ROC AUC')
    results = pd.DataFrame(columns=columns)
    
train, test = preprocess(train,
                         test,
                         test_labels,
                         parameters['max_length'][0],
                         parameters['Sample'][0])

model = Model(train, test)
model = model.build(max_length=parameters['max_length'][0],
                    n_epochs=parameters['n_epochs'][0],
                    batch_size=parameters['batch_size'][0],
                    gradient_clipping=parameters['gradient_clipping'][0],
                    lr_update=parameters['lr_update'][0],
                    model_lr=parameters['model_lr'][0],
                    classifier_lr=parameters['classifier_lr'][0],
                    weight_decay=parameters['weight_decay'][0],
                    FOLDS=parameters['folds'][0],
                    device="cuda"
)
model = model.train()
_, roc_auc = model.evaluate()

parameters.update({'ROC AUC': [roc_auc]})
if results.empty:
    results = pd.DataFrame(parameters)
elif not results.isin(parameters).all(axis=1).any():
    results = results.append(parameters, ignore_index=True)
results.to_csv("results.csv", index=False)

Starting Epoch 1
Train phase


100%|██████████| 3989/3989 [54:03<00:00,  1.23it/s]


Train metrics
Result Roc Auc: 0.9513172580755968
Train BCE losses
Result BCE loss: 0.07202502340078354
Valid phase


100%|██████████| 998/998 [04:19<00:00,  3.85it/s]


Valid metrics
Result Roc Auc: 0.9861557119488987
Valid BCE losses
Result BCE loss: 0.04262630268931389
Starting Epoch 2
Train phase


100%|██████████| 3989/3989 [54:02<00:00,  1.23it/s]


Train metrics
Result Roc Auc: 0.9886249587017906
Train BCE losses
Result BCE loss: 0.0385504812002182
Valid phase


100%|██████████| 998/998 [04:19<00:00,  3.85it/s]


Valid metrics
Result Roc Auc: 0.9877960884632588
Valid BCE losses
Result BCE loss: 0.04103647172451019
Starting Epoch 3
Train phase


100%|██████████| 3989/3989 [54:02<00:00,  1.23it/s]


Train metrics
Result Roc Auc: 0.9923040302213552
Train BCE losses
Result BCE loss: 0.033617328852415085
Valid phase


100%|██████████| 998/998 [04:19<00:00,  3.85it/s]


Valid metrics
Result Roc Auc: 0.9874316687233299
Valid BCE losses
Result BCE loss: 0.04154268652200699
Starting Epoch 4
Train phase


100%|██████████| 3989/3989 [54:05<00:00,  1.23it/s]


Train metrics
Result Roc Auc: 0.9942330309021455
Train BCE losses
Result BCE loss: 0.02994224801659584
Valid phase


100%|██████████| 998/998 [04:18<00:00,  3.86it/s]


Valid metrics
Result Roc Auc: 0.9869368512107836
Valid BCE losses
Result BCE loss: 0.042650774121284485
Starting Epoch 5
Train phase


100%|██████████| 3989/3989 [54:02<00:00,  1.23it/s]


Train metrics
Result Roc Auc: 0.995350926658169
Train BCE losses
Result BCE loss: 0.027291186153888702
Valid phase


100%|██████████| 998/998 [04:18<00:00,  3.86it/s]


Valid metrics
Result Roc Auc: 0.9862750219506132
Valid BCE losses
Result BCE loss: 0.04389544576406479


100%|██████████| 2000/2000 [08:38<00:00,  3.86it/s]


Test metrics
Result Roc Auc: 0.9827018158077921
Test BCE losses
Result BCE loss: 0.0730869472026825
Result Roc Auc: 0.9827018158077921


In [45]:
results

Unnamed: 0,Sample,max_length,n_epochs,batch_size,gradient_clipping,lr_update,model_lr,classifier_lr,weight_decay,folds,ROC AUC
0,1,512,5,32,False,False,1e-05,0.001,0,5,0.982702


In [46]:
print("a")

a
