In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
import torch

In [None]:
df = pd.read_csv('../input/toxicityclassifier/toberttoxic3ep.csv')

In [None]:
tokenizer = BertTokenizer.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')
model = BertForSequenceClassification.from_pretrained('SkolkovoInstitute/russian_toxicity_classifier')

In [None]:
DEVICE = 'cpu'

In [None]:
def convert_to_dataset_torch(data: pd.DataFrame) -> TensorDataset:
    """Tokenize the ``civil`` column of ``data`` into a ``TensorDataset``.

    Each text is encoded with the module-level ``tokenizer`` and truncated or
    padded to exactly 512 tokens, so all rows can be stacked into rectangular
    tensors.

    Args:
        data: Frame with a ``civil`` column containing the texts to encode.

    Returns:
        A ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, token_type_ids)``. Each underlying
        tensor has shape ``(len(data), 512)``, dtype ``torch.long`` and lives
        on ``DEVICE``; the model consuming the dataset must be on the same
        device as the batches it receives.
    """
    max_length = 512
    input_ids = []
    attention_masks = []
    token_type_ids = []

    # Only one column is needed, so iterate it directly instead of building a
    # Series per row with iterrows().
    for text in tqdm(data["civil"], total=data.shape[0]):
        encoded = tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        token_type_ids.append(encoded['token_type_ids'])
        # The attention mask differentiates padding from non-padding tokens.
        attention_masks.append(encoded['attention_mask'])

    if input_ids:
        # Each encoding is (1, max_length); stack them along the sample axis.
        input_ids = torch.cat(input_ids, dim=0)
        token_type_ids = torch.cat(token_type_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
    else:
        # torch.cat rejects an empty list, so build empty tensors explicitly.
        input_ids = torch.empty((0, max_length), dtype=torch.long)
        token_type_ids = torch.empty((0, max_length), dtype=torch.long)
        attention_masks = torch.empty((0, max_length), dtype=torch.long)

    # Tensor.to returns a new tensor; the result has to be kept for the
    # device/dtype request to take effect.
    input_ids = input_ids.to(DEVICE, dtype=torch.long)
    token_type_ids = token_type_ids.to(DEVICE, dtype=torch.long)
    attention_masks = attention_masks.to(DEVICE, dtype=torch.long)
    return TensorDataset(input_ids, attention_masks, token_type_ids)

In [None]:
# Tokenize the `civil` column of df; `test` is wrapped in a DataLoader further down.
test = convert_to_dataset_torch(df)

In [None]:
def eval_batch(dataloader, model):
    """Run ``model`` over ``dataloader`` without gradients and collect logits.

    Args:
        dataloader: Iterable yielding
            ``(input_ids, attention_masks, token_type_ids)`` batches.
        model: Callable accepting ``input_ids`` positionally plus
            ``token_type_ids`` / ``attention_mask`` keywords and returning an
            object with a ``.logits`` tensor.

    Returns:
        A list with one numpy array of logits per batch, in iteration order.
    """
    embs = []

    # Dropout must be disabled for deterministic predictions. Remember the
    # previous mode so the caller's model is left as it was found.
    was_training = bool(getattr(model, "training", False))
    if was_training:
        model.eval()

    # Feed each batch on whatever device the model's weights live on.
    first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
    device = first_param.device if first_param is not None else None

    try:
        with torch.no_grad():
            for batch in tqdm(dataloader, desc="Evaluating", unit="batch"):
                input_ids, attention_masks, token_type_ids = batch
                if device is not None:
                    input_ids = input_ids.to(device)
                    attention_masks = attention_masks.to(device)
                    token_type_ids = token_type_ids.to(device)

                # Forward pass, calculate logit predictions.
                logits = model(
                    input_ids,
                    token_type_ids=token_type_ids,
                    attention_mask=attention_masks,
                ).logits

                embs.append(logits.detach().cpu().numpy())
    finally:
        if was_training:
            model.train()
    return embs

In [None]:
test_dataloader = DataLoader(test, batch_size=1)

In [None]:
embeddings = eval_batch(test_dataloader, model)

In [None]:
softmax = torch.nn.Softmax()

In [None]:
embs = torch.Tensor(embeddings)

In [None]:
embs = embs.squeeze(1)

In [None]:
preds = softmax(embs)

In [None]:
dupl = pd.DataFrame(preds)

In [None]:
dupl.columns = ['nontoxic', 'toxic']

In [None]:
df1 = pd.concat([df, dupl['toxic']], axis=1)

In [None]:
df1.toxic = df1.toxic.apply(lambda x: x.item())

In [None]:
!pip install openpyxl

In [None]:
df1.to_excel('3eptoxicity.xlsx')