In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BertTokenizer
import torch
import numpy as np
import pandas as pd

# Notebook-wide tokenizer for the DeepPavlov RuBERT checkpoint; the Dataset class
# and predict() read this module-level name.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [None]:
# Load the labelled corpus from a UTF-16 encoded CSV and preview its last rows.
df = pd.read_csv('dataset.csv', encoding='utf-16')
df.tail()

Unnamed: 0,comment,toxic
11667,надеюсь это постирония дваче стало фашистов ра...,1
11668,вообще повезет хотя последнее время рекламы ст...,0
11669,гдето сжигают книги гдето фотографов возможно ...,1
11670,миллионную биткоина,0
11671,бонусом регулярная реклама яндекс обрыгаловки,0


In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenized texts paired with their labels.

    Every text is tokenized once, eagerly, inside ``__init__``.

    Args:
        df: Object indexable by column name (e.g. a pandas DataFrame).
        label_name: Name of the column holding the labels.
        text_name: Name of the column holding the raw texts.
        max_length: Length each sequence is padded/truncated to. Defaults to
            512, the value that used to be hard-coded.
        tokenizer_fn: Callable used to tokenize one string. Defaults to the
            module-level ``tokenizer``.
    """

    def __init__(self, df, label_name, text_name, max_length=512, tokenizer_fn=None):
        # Fall back to the notebook-wide tokenizer so existing three-argument
        # calls keep working unchanged.
        tokenize = tokenizer if tokenizer_fn is None else tokenizer_fn

        self.labels = list(df[label_name])
        # str() coerces non-string cells before they reach the tokenizer.
        self.data = [
            tokenize(str(text), padding='max_length', max_length=max_length,
                     truncation=True, return_tensors="pt")
            for text in df[text_name]
        ]

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` where the label is wrapped in a numpy array."""
        # NOTE(review): callers squeeze axis 1 of the batched tensors, which suggests
        # each encoding carries a leading axis of size 1 — confirm against the tokenizer.
        return self.data[idx], np.array(self.labels[idx])

In [None]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear head and a ReLU.

    Args:
        dropout: Dropout probability applied to the pooled BERT output.
        model_name: Checkpoint name handed to ``BertModel.from_pretrained``.
        num_classes: Number of output units of the linear head.
    """

    def __init__(self, dropout=0.5, model_name='DeepPavlov/rubert-base-cased', num_classes=2):
        # Zero-argument super(): the previous super(self.__class__, self) form
        # recurses forever as soon as this class is subclassed.
        super().__init__()

        self.bert = BertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from its config instead of hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return per-class scores for a batch.

        Args:
            input_id: Token ids, forwarded to BERT as ``input_ids``.
            mask: Attention mask, forwarded to BERT as ``attention_mask``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (named pooled_output here) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): the ReLU clamps the scores to >= 0 before they reach
        # CrossEntropyLoss/argmax. It is kept so already-trained checkpoints keep
        # producing the same predictions; consider dropping it when retraining.
        final_layer = self.relu(linear_output)

        return final_layer
    

In [None]:
# Randomly keep 85% of the rows for training (fixed random_state makes the split
# repeatable); the remaining rows form the validation split.
df_train = df.sample(frac=0.85, random_state=25)
df_val = df.drop(df_train.index)

print(f"No. of training examples: {df_train.shape[0]}")
# df_val is passed to train() as val_data; the test set comes from a separate CSV,
# so calling these rows "testing examples" was misleading.
print(f"No. of validation examples: {df_val.shape[0]}")

No. of training examples: 9921
No. of testing examples: 1751


In [None]:
# Separate test set, read from its own UTF-16 encoded CSV.
df_test = pd.read_csv('dataset_test.csv', encoding='utf-16')

In [None]:
def evaluate(model, test_data, batch_size=2, label_name='toxic', text_name='comment'):
    """Score ``test_data`` with ``model`` and return the predicted classes.

    Prints the accuracy as ``Test Accuracy: ...``. As a side effect the model is
    left in eval mode and, when CUDA is available, on the GPU.

    Args:
        model: Module called as ``model(input_id, mask)`` returning per-class scores.
        test_data: Table holding the label and text columns (wrapped in ``Dataset``).
        batch_size: Batch size of the evaluation loader (default 2, the former
            hard-coded value).
        label_name: Name of the label column (default 'toxic').
        text_name: Name of the text column (default 'comment').

    Returns:
        list[int]: predicted class index per row, in row order (the loader is
        not shuffled).
    """
    test = Dataset(test_data, label_name, text_name)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    preds = []
    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)

            # Squeeze the mask exactly like the ids so both reach the model with
            # the same rank (previously only input_ids was squeezed).
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            predicted = model(input_id, mask).argmax(dim=1)
            total_acc_test += (predicted == test_label).sum().item()
            preds.extend(predicted.tolist())

    # Guard the empty case instead of raising ZeroDivisionError.
    n_rows = len(test_data)
    accuracy = total_acc_test / n_rows if n_rows else float('nan')
    print(f'Test Accuracy: {accuracy: .3f}')

    return preds

In [None]:
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score

def train(df_test, model, train_data, val_data, learning_rate, epochs, batch_size):
    """Fine-tune ``model`` and print train/validation/test metrics after every epoch.

    Args:
        df_test: Table with 'toxic' and 'comment' columns, scored through
            ``evaluate`` at the end of each epoch.
        model: Module called as ``model(input_id, mask)`` returning per-class scores.
        train_data: Table with 'toxic' and 'comment' columns used for optimisation.
        val_data: Table with the same columns, used for the per-epoch validation pass.
        learning_rate: Learning rate handed to Adam.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size of the training and validation loaders.

    Returns:
        None. ``model`` is optimised in place.
    """
    train_set = Dataset(train_data, 'toxic', 'comment')
    val_set = Dataset(val_data, 'toxic', 'comment')

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Divisors for the per-sample averages; max(..., 1) avoids ZeroDivisionError
    # when a split is empty.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    for epoch_num in range(epochs):
        # evaluate() at the end of the previous epoch leaves the model in eval
        # mode; without this call every epoch after the first would train with
        # dropout switched off.
        model.train()

        total_acc_train = 0
        total_loss_train = 0.0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            # Squeeze the mask like the ids so both reach the model with the same rank.
            mask = train_input['attention_mask'].squeeze(1).to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            # The criterion returns the batch mean; weight it by the batch size so
            # that dividing by the sample count yields a true per-sample mean.
            total_loss_train += batch_loss.item() * train_label.size(0)
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Validation must run with dropout disabled.
        model.eval()

        total_acc_val = 0
        total_loss_val = 0.0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].squeeze(1).to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item() * val_label.size(0)
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        # The 16 leading spaces inside the literals reproduce the original
        # line-continued message byte for byte.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} '
            f'                | Train Accuracy: {total_acc_train / n_train: .3f} '
            f'                | Val Loss: {total_loss_val / n_val: .3f} '
            f'                | Val Accuracy: {total_acc_val / n_val: .3f}')

        preds = evaluate(model, df_test)
        df_test = df_test.assign(preds=preds)

        precision = precision_score(df_test['toxic'], df_test['preds'])
        recall = recall_score(df_test['toxic'], df_test['preds'])
        # NOTE(review): roc_auc is computed from hard class predictions, not from
        # scores/probabilities — confirm this is the intended metric.
        roc_auc = roc_auc_score(df_test['toxic'], df_test['preds'])
        f1 = f1_score(df_test['toxic'], df_test['preds'])

        print(f'precision: { precision } ')
        print(f'recall: { recall } ')
        print(f'roc_auc: { roc_auc } ')
        print(f'f1: { f1 }')
                  
# Hyper-parameters of the fine-tuning run.
LR = 2e-5
EPOCHS = 4
batch_size = 16

# Fresh classifier built with its default arguments.
model = BertClassifier()

train(
    df_test=df_test,
    model=model,
    train_data=df_train,
    val_data=df_val,
    learning_rate=LR,
    epochs=EPOCHS,
    batch_size=batch_size,
)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 621/621 [15:13<00:00,  1.47s/it]


Epochs: 1 | Train Loss:  0.019                 | Train Accuracy:  0.874                 | Val Loss:  0.014                 | Val Accuracy:  0.909
Test Accuracy:  0.920
precision: 0.8931623931623932 
recall: 0.865424430641822 
roc_auc: 0.9066433936316514 
f1: 0.879074658254469


100%|██████████| 621/621 [15:13<00:00,  1.47s/it]


Epochs: 2 | Train Loss:  0.009                 | Train Accuracy:  0.949                 | Val Loss:  0.018                 | Val Accuracy:  0.902
Test Accuracy:  0.906
precision: 0.8954545454545455 
recall: 0.8157349896480331 
roc_auc: 0.8838841788698979 
f1: 0.8537378114842903


100%|██████████| 621/621 [15:13<00:00,  1.47s/it]


Epochs: 3 | Train Loss:  0.005                 | Train Accuracy:  0.978                 | Val Loss:  0.018                 | Val Accuracy:  0.900
Test Accuracy:  0.911
precision: 0.8256880733944955 
recall: 0.9316770186335404 
roc_auc: 0.9163077481071769 
f1: 0.8754863813229572


100%|██████████| 621/621 [15:12<00:00,  1.47s/it]


Epochs: 4 | Train Loss:  0.003                 | Train Accuracy:  0.987                 | Val Loss:  0.018                 | Val Accuracy:  0.903
Test Accuracy:  0.916
precision: 0.9004424778761062 
recall: 0.8426501035196687 
roc_auc: 0.8978631122395007 
f1: 0.8705882352941177


In [None]:
import torch

# Persist the trained model twice: once as the whole pickled module and once as
# just its state_dict (parameters only).
torch.save(model, '/content/model.pt')
torch.save(model.state_dict(), '/content/model_dict.pt')

In [None]:
# evaluate() prints the test accuracy and returns the predicted class for each row.
preds = evaluate(model, df_test)

Test Accuracy:  0.913


In [None]:
# Attach the predictions to the test table as a 'preds' column.
df_test = df_test.assign(preds=preds)

In [None]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score, f1_score

# Score the 'preds' column against the 'toxic' labels with each metric in turn.
precision, recall, roc_auc, f1 = (
    score(df_test['toxic'], df_test['preds'])
    for score in (precision_score, recall_score, roc_auc_score, f1_score)
)

# One metric per line, same text as before.
print(
    f'precision: { precision } ',
    f'recall: { recall } ',
    f'roc_auc: { roc_auc } ',
    f'f1: { f1 }',
    sep='\n',
)


# Reference scores noted by the author (presumably from an earlier run):
#   precision 0.908
#   recall    0.867
#   roc_auc   0.9118496017893035
#   f1        0.8877118644067797

precision: 0.8348968105065666 
recall: 0.9213250517598344 
roc_auc: 0.9147813997068203 
f1: 0.8759842519685039


In [None]:
# Load the pickled model onto whichever device this runtime has: without
# map_location a checkpoint saved from a GPU session fails to load on a CPU-only one.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load files you
# trust. Recent PyTorch releases may additionally require weights_only=False to load
# a full module; confirm against the installed version.
# NOTE(review): the path lives on Google Drive, so the drive.mount cell must have
# been run before this one.
model = torch.load(
    '/content/drive/MyDrive/model.pt',
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)
model.eval()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
import re
import nltk
# Download the NLTK 'stopwords' corpus before it is read below.
nltk.download('stopwords')
from nltk.corpus import stopwords

# Russian stop-word list consulted by remove_stopwords().
stopword = stopwords.words('russian')

def remove_stopwords(text):
    """Drop every whitespace-separated token that appears in the global ``stopword``.

    Each kept token is followed by a single space, so a non-empty result ends
    with a trailing space; an empty string is returned when nothing is kept.
    """
    return ''.join(f'{token} ' for token in text.split() if token not in stopword)

'''
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

def lemmatize(text):
    words = text.split() # разбиваем текст на слова
    res = list()
    for word in words:
        p = morph.parse(word)[0]
        res.append(p.normal_form + ' ')

    return ''.join(res)
'''


def predict(model, text):
    """Classify a single text, print the verdict and return the predicted class.

    The text is lower-cased and passed through ``remove_stopwords`` before it is
    tokenized with the module-level ``tokenizer``. The preprocessed text is
    printed, followed by 'ok' for class 0 or 'toxic' for class 1.

    As a side effect the model is switched to eval mode and, when CUDA is
    available, moved to the GPU.

    Args:
        model: Module called as ``model(input_id, mask)`` returning per-class scores.
        text: Raw input string.

    Returns:
        The tensor produced by ``argmax(dim=1)`` over the model output.
    """
    text = text.lower()  # normalise to lower case
    text = remove_stopwords(text)
    # Lemmatization is currently disabled.
    print(text)

    tokenized = tokenizer(text,
                          padding='max_length', max_length=512, truncation=True,
                          return_tensors="pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    # Inference must not depend on the mode the caller left the model in:
    # in train mode dropout would make the prediction random.
    model.eval()

    with torch.no_grad():
        mask = tokenized['attention_mask'].to(device)
        input_id = tokenized['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)

    prediction = output.argmax(dim=1)
    if prediction == 0:
        print('ok')
    if prediction == 1:
        print('toxic')

    return prediction
    


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Try the classifier on one short phrase.
predict(model, 'да и что?')

что? 
ok


tensor([0], device='cuda:0')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): the torch.load cell reads from /content/drive/MyDrive, so this mount
# has to run before it even though it appears later in the notebook.
drive.mount('/content/drive')

Mounted at /content/drive
