In [None]:
!pip install transformers --quiet

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import defaultdict
import copy


In [None]:
# Labelled training comments and the test set, read from the competition input folder.
df = pd.read_csv('../input/toxic-comments-classification-apdl-2021/train_data.csv')
test_df = pd.read_csv('../input/toxic-comments-classification-apdl-2021/test_data.csv')

print(df.columns)
print(df.shape)

# Column 0 is recorded as the feature column, column 1 as the target column.
feature_col, target_col = df.columns[0], df.columns[1]
df.head()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler

from transformers import (AutoTokenizer, AutoModel, 
                          AutoModelForSequenceClassification, 
                          DataCollatorWithPadding, AdamW, get_scheduler,
                          get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
                          )

import pyarrow as pa
from tqdm.auto import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import datasets
import random
from sklearn.metrics import classification_report

In [None]:
# One seed shared by every random number generator the notebook uses.
seed_value = 42

# PyTorch (CPU and all CUDA devices), then NumPy, then the stdlib generator.
torch.manual_seed(seed_value)
torch.cuda.manual_seed_all(seed_value)
np.random.seed(seed_value)
random.seed(seed_value)

In [None]:
# Hold out 15% of the labelled rows for validation; the seed makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.15, random_state=seed_value)

# Row counts, one per line: train, validation, test.
print(len(train_df), len(val_df), len(test_df), sep="\n")

In [None]:
# Give every frame a fresh 0..n-1 index and discard the old one.
# reset_index(drop=True) does this in a single step and returns a new frame,
# so the frames produced by train_test_split are not mutated in place.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Name of the pretrained checkpoint; the same identifier is reused when the model is loaded.
checkpoint = "DeepPavlov/distilrubert-base-cased-conversational"
# Tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Padding collator bound to `tokenizer`.
# NOTE(review): the DataLoaders in this notebook are built without a collate_fn,
# so this object appears to be unused — confirm before removing.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Encoding options shared by all three splits: truncate to at most 200 tokens,
# pad within each call, and do not return token_type_ids.
tokenize_kwargs = dict(
    max_length=200,
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)

train_tokens = tokenizer.batch_encode_plus(train_df["comment"].tolist(), **tokenize_kwargs)
val_tokens = tokenizer.batch_encode_plus(val_df["comment"].tolist(), **tokenize_kwargs)
test_tokens = tokenizer.batch_encode_plus(test_df["comment"].tolist(), **tokenize_kwargs)

In [None]:
# Token ids for every split.
train_seq = torch.tensor(train_tokens["input_ids"])
val_seq = torch.tensor(val_tokens["input_ids"])
test_seq = torch.tensor(test_tokens["input_ids"])

# Matching attention masks.
train_mask = torch.tensor(train_tokens["attention_mask"])
val_mask = torch.tensor(val_tokens["attention_mask"])
test_mask = torch.tensor(test_tokens["attention_mask"])

# Labels for the labelled splits, with a trailing axis of size 1 added.
train_y = torch.tensor(train_df["toxic"].to_numpy()).unsqueeze(-1)
val_y = torch.tensor(val_df["toxic"].to_numpy()).unsqueeze(-1)


In [None]:
# Datasets: (ids, mask, label) for train/validation, (ids, mask) for test.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)
test_data = TensorDataset(test_seq, test_mask)

# Random order for training, dataset order for validation and test.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

In [None]:
# Mini-batch size shared by the three loaders.
batch_size = 220

# Each loader draws indices from the sampler prepared for its split.
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_loader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
def deleteEncodingLayers(model, num_layers_to_keep):
    """Return a copy of ``model`` that keeps only its first transformer layers.

    The input model is left untouched and the returned copy owns its own layer
    modules, so training the copy never changes the weights of ``model``.
    The model's config object is not modified.

    Args:
        model: Module exposing ``model.distilbert.transformer.layer`` as an
            indexable sequence of layer modules.
        num_layers_to_keep: Number of leading layers to retain.

    Returns:
        A deep copy of ``model`` whose ``distilbert.transformer.layer`` is an
        ``nn.ModuleList`` holding the first ``num_layers_to_keep`` layers.

    Raises:
        IndexError: If ``num_layers_to_keep`` exceeds the number of layers.
    """
    model_copy = copy.deepcopy(model)

    # Take the layers from the copy, not from the original: reusing the
    # original's modules would make both models share the same parameters.
    copied_layers = model_copy.distilbert.transformer.layer
    model_copy.distilbert.transformer.layer = nn.ModuleList(
        [copied_layers[i] for i in range(num_layers_to_keep)]
    )

    return model_copy

In [None]:
# Sequence-classification model loaded from the checkpoint with a single output label.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 1)
# Overwrite the model's `dropout` attribute with a p=0.1 dropout module.
model.dropout = nn.Dropout(p=0.1, inplace=False)
# Keep only the first 3 transformer layers.
model = deleteEncodingLayers(model, 3)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [None]:
LEARN_RATE = 3e-5
epochs = 2
# Number of batches per epoch times the number of epochs.
total_steps = len(train_loader) * epochs

optimizer = AdamW(model.parameters(), lr=LEARN_RATE, eps=1e-8)
# Linear schedule without warm-up, spanning total_steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
# Loss applied directly to the model's raw logits.
criterion = nn.BCEWithLogitsLoss()

In [None]:
def calculate_accuracy(output, target):
    """Return the fraction of rows whose thresholded prediction matches the label.

    ``output`` holds raw logits; a row counts as positive when its sigmoid is
    at least 0.5. ``target`` counts as positive where it equals 1.0. The
    number of matches (summed over axis 0) is divided by the number of rows
    and returned as a Python float.
    """
    predicted_positive = torch.sigmoid(output).ge(0.5)
    actual_positive = target.eq(1.0)
    n_correct = torch.eq(actual_positive, predicted_positive).sum(dim=0)
    n_rows = predicted_positive.size(0)
    return torch.true_divide(n_correct, n_rows).item()

In [None]:
class MetricMonitor:
    """Tracks a running sum, update count and mean for each named metric."""

    def __init__(self, float_precision=3):
        # Number of decimals used when the monitor is rendered as text.
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        """Discard every accumulated statistic."""
        # Per metric: "val" is the running sum of reported values (not the
        # latest one), "count" the number of updates, "avg" their mean.
        self.metrics = defaultdict(lambda: dict(val=0, count=0, avg=0))

    def update(self, metric_name, val):
        """Record one observation ``val`` for ``metric_name`` and refresh its mean."""
        stats = self.metrics[metric_name]
        stats["count"] += 1
        stats["val"] += val
        stats["avg"] = stats["val"] / stats["count"]

    def __str__(self):
        """Render ``name: mean`` pairs, in insertion order, joined by `` | ``."""
        parts = []
        for name, stats in self.metrics.items():
            parts.append(f"{name}: {stats['avg']:.{self.float_precision}f}")
        return " | ".join(parts)

In [None]:
def train(train_loader, model, criterion, optimizer, epoch, scheduler=None):
    """Run one training epoch, showing running loss and accuracy on a progress bar.

    Args:
        train_loader: Iterable of batches ordered as
            (input_ids, attention_mask, labels).
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epoch: Epoch number; used only in the progress-bar text.
        scheduler: Optional learning-rate scheduler. When given, it is stepped
            once after every optimizer step. Defaults to ``None``, in which
            case the learning rate is not scheduled.
    """
    metric_monitor = MetricMonitor()
    model.train()
    stream = tqdm(train_loader)
    for batch in stream:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Labels are not passed to the model: the loss comes from `criterion`
        # below, so a loss computed inside the model would be wasted work.
        output = model(b_input_ids, attention_mask=b_input_mask).logits

        # The loss expects floating-point targets; match the logits' dtype.
        b_labels = b_labels.to(output.dtype)

        loss = criterion(output, b_labels)
        accuracy = calculate_accuracy(output, b_labels)
        metric_monitor.update("Loss", loss.item())
        metric_monitor.update("Accuracy", accuracy)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        stream.set_description(
            "Epoch: {epoch}. Train.      {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor)
        )

In [None]:
def validate(val_loader, model, criterion, epoch):
    """Evaluate ``model`` on ``val_loader`` without gradient tracking.

    Running loss and accuracy are shown on a progress bar; nothing is returned.

    Args:
        val_loader: Iterable of batches ordered as
            (input_ids, attention_mask, labels).
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.
        criterion: Loss called as ``criterion(logits, labels)``.
        epoch: Epoch number; used only in the progress-bar text.
    """
    metric_monitor = MetricMonitor()
    model.eval()
    stream = tqdm(val_loader)
    with torch.no_grad():
        for batch in stream:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Labels are not passed to the model: the loss comes from
            # `criterion`, so a loss computed inside the model would be unused.
            output = model(b_input_ids, attention_mask=b_input_mask).logits

            # The loss expects floating-point targets; match the logits' dtype.
            b_labels = b_labels.to(output.dtype)

            loss = criterion(output, b_labels)
            accuracy = calculate_accuracy(output, b_labels)

            metric_monitor.update("Loss", loss.item())
            metric_monitor.update("Accuracy", accuracy)
            stream.set_description(
                "Epoch: {epoch}. Validation. {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor)
            )

In [None]:
# One training pass followed by one validation pass per epoch.
# NOTE(review): the `scheduler` object created earlier is neither stepped in this
# loop nor handed to train(), so the learning-rate schedule is not applied — confirm intent.
for epoch in range(1, epochs + 1):
    train(train_loader, model, criterion, optimizer, epoch)
    validate(val_loader, model, criterion, epoch)

In [None]:
def predict(test_loader, model):
    """Run ``model`` over ``test_loader`` and return hard labels and probabilities.

    Args:
        test_loader: Iterable of batches whose first two items are
            (input_ids, attention_mask); any further items are ignored.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            result exposes ``.logits``.

    Returns:
        Tuple ``(predictions, pred_probas)`` of NumPy arrays, each stacking the
        per-batch results along axis 0. ``predictions`` holds integer 0/1
        labels (1 where the sigmoid probability is >= 0.5) and ``pred_probas``
        holds the sigmoid probabilities.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    label_chunks = []
    proba_chunks = []

    with torch.no_grad():
        for batch in tqdm(test_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            output = model(b_input_ids, attention_mask=b_input_mask).logits

            pred_probas_sigma = torch.sigmoid(output)
            # Threshold the probabilities, not the raw logits, so the cut-off
            # is the same one calculate_accuracy uses (sigmoid(output) >= 0.5).
            pred_labels = (pred_probas_sigma >= 0.5).int()

            label_chunks.append(pred_labels.cpu().numpy())
            proba_chunks.append(pred_probas_sigma.cpu().numpy())

    # Join once at the end instead of re-allocating the arrays on every batch.
    predictions = np.concatenate(label_chunks, axis=0)
    pred_probas = np.concatenate(proba_chunks, axis=0)
    return predictions, pred_probas
                
                
# Hard labels and probabilities of the first model on the test set.
results, res_probas = predict(test_loader, model)

# Put the predicted label next to each comment_id and write the submission file.
predictions_df = pd.DataFrame(results, columns=["toxic"])
submission = pd.concat([test_df["comment_id"], predictions_df], axis=1)
submission.to_csv("submission.csv", index=False, header=True)

In [None]:
# First model on the validation split; `res_probas` is rebound to the validation probabilities.
val_results, res_probas = predict(val_loader, model)
# Element-wise agreement between predicted and true labels.
comparison = torch.Tensor(val_results).eq(val_y)
# Row indices at which prediction and label disagree.
diff = torch.nonzero(comparison == 0, as_tuple=True)[0]

In [None]:
def deleteEncodingLayers2(model, num_layers_to_keep):
    """Return a copy of ``model`` that keeps only its first encoder layers.

    The input model is left untouched and the returned copy owns its own layer
    modules, so training the copy never changes the weights of ``model``.
    The model's config object is not modified.

    Args:
        model: Module exposing ``model.bert.encoder.layer`` as an indexable
            sequence of layer modules.
        num_layers_to_keep: Number of leading layers to retain.

    Returns:
        A deep copy of ``model`` whose ``bert.encoder.layer`` is an
        ``nn.ModuleList`` holding the first ``num_layers_to_keep`` layers.

    Raises:
        IndexError: If ``num_layers_to_keep`` exceeds the number of layers.
    """
    model_copy = copy.deepcopy(model)

    # Take the layers from the copy, not from the original: reusing the
    # original's modules would make both models share the same parameters.
    copied_layers = model_copy.bert.encoder.layer
    model_copy.bert.encoder.layer = nn.ModuleList(
        [copied_layers[i] for i in range(num_layers_to_keep)]
    )

    return model_copy

In [None]:
# Second model: a different checkpoint with its own tokenizer. The rest of this
# cell rebuilds tokens, tensors, datasets and loaders under the names used before.
checkpoint2 = "DeepPavlov/rubert-base-cased"
tokenizer2 = AutoTokenizer.from_pretrained(checkpoint2)

# Encoding options shared by the three splits.
tokenize_kwargs = dict(
    max_length=200,
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)
train_tokens = tokenizer2.batch_encode_plus(train_df["comment"].tolist(), **tokenize_kwargs)
val_tokens = tokenizer2.batch_encode_plus(val_df["comment"].tolist(), **tokenize_kwargs)
test_tokens = tokenizer2.batch_encode_plus(test_df["comment"].tolist(), **tokenize_kwargs)

# Token ids, attention masks, and labels with a trailing axis of size 1.
train_seq = torch.tensor(train_tokens["input_ids"])
train_mask = torch.tensor(train_tokens["attention_mask"])
train_y = torch.tensor(train_df["toxic"].to_numpy()).unsqueeze(-1)

val_seq = torch.tensor(val_tokens["input_ids"])
val_mask = torch.tensor(val_tokens["attention_mask"])
val_y = torch.tensor(val_df["toxic"].to_numpy()).unsqueeze(-1)

test_seq = torch.tensor(test_tokens["input_ids"])
test_mask = torch.tensor(test_tokens["attention_mask"])

# Datasets first, then one sampler per dataset.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)
test_data = TensorDataset(test_seq, test_mask)

train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

batch_size = 220

train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_loader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Classification model with a single output label, cut down to its first 2 encoder layers.
model2 = AutoModelForSequenceClassification.from_pretrained(checkpoint2, num_labels=1)
model2 = deleteEncodingLayers2(model2, 2)

model2.to(device);

optimizer2 = AdamW(model2.parameters(), lr=LEARN_RATE, eps=1e-8)
epochs = 4
total_steps = len(train_loader) * epochs
scheduler2 = get_linear_schedule_with_warmup(
    optimizer2,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
criterion2 = nn.BCEWithLogitsLoss()

# One training pass and one validation pass per epoch for the second model.
for epoch in range(1, epochs + 1):
    train(train_loader, model2, criterion2, optimizer2, epoch)
    validate(val_loader, model2, criterion2, epoch)

In [None]:
# Second model on the validation split.
val_results2, res_probas2 = predict(val_loader, model2)
# Element-wise agreement between predicted and true labels.
comparison2 = torch.Tensor(val_results2).eq(val_y)
# Row indices at which prediction and label disagree.
diff2 = torch.nonzero(comparison2 == 0, as_tuple=True)[0]

In [None]:
# Side by side: first model's probabilities, then the second model's.
ens = np.concatenate([res_probas, res_probas2], axis=1)
alpha = 0.8
# Row-wise weighted mean: alpha for the first model, 1 - alpha for the second.
results = np.average(ens, axis=1, weights=[alpha, 1 - alpha])
results_sigma = results >= 0.5

# Agreement of the thresholded ensemble output with the validation labels.
comparison3 = torch.Tensor(results_sigma).int().unsqueeze(-1).eq(val_y)

# Row indices of disagreements for the ensemble and for the first model.
diff3 = torch.nonzero(comparison3 == 0, as_tuple=True)[0]
diff = torch.nonzero(comparison == 0, as_tuple=True)[0]

In [None]:
# Share of validation rows where the thresholded ensemble output equals the label.
target = val_y == 1.0
torch.true_divide(
    torch.Tensor(results_sigma).unsqueeze(-1).eq(target).sum(dim=0),
    len(results_sigma),
).item()