In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

Thu Oct 21 14:26:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.74       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P8    34W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

В нашем распоряжении видеокарта Tesla K80. Неплохо! Используем ее на всю, так сказать.

In [14]:
import numpy as np
import pandas as pd
import re

import torch
import transformers
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

In [15]:
data = pd.read_csv('/content/toxic_comments.csv')
data.shape

(159571, 2)

In [16]:
# Функция для очистки текста и приведения к нижнему регистру

def clean(text):
    
    text = text.lower()    
    text = re.sub(r"(?:\n|\r)", " ", text)
    text = re.sub(r"[^a-zA-Z ]+", "", text).strip()
    
    return text

In [17]:
data['text_clean'] = data['text'].apply(clean)
data[['text', 'text_clean']].sample(5)

Unnamed: 0,text,text_clean
55936,Green Tea and CigarettesWhy you're a worthless...,green tea and cigaretteswhy youre a worthless ...
142392,"""\n\n Non-enyclopedic? \n\n""""On February 24, 1...",nonenyclopedic on february iowa put to sea...
118335,"""\nHi, the page that I wanted deleting was the...",hi the page that i wanted deleting was the pag...
130840,Well the article i created then smart ass. why...,well the article i created then smart ass why ...
81788,"""\n\nSpeedy deletion of Www.wikipedia.com\n A ...",speedy deletion of wwwwikipediacom a tag has ...


In [18]:

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')  # Создаем токенизатор
tokenized = data['text'][:2500].apply(lambda x: tokenizer.encode(x,truncation=True, add_special_tokens=True)) #Производим токенизацию текстов в датасете

max_len = 0  #Находим максимальную длину токенизированного комментария
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

# Создаем матрицу размера max_len x n_tokens (размер максимального токенизированного комментария на кол-во текстов) и маску
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)

# Используем заранее обученную BERT-модель
model = transformers.BertModel.from_pretrained('bert-base-uncased')

# Задаем размер батча, т.е. такое кол-во комментариев, которое будет принимать на вход модель
batch_size = 100
embeddings = []

for i in tqdm(range(padded.shape[0] // batch_size)):
        # проходим по batch_size, 0...100, 100....200, и т.д. то есть берем по кусочку матриц
        # padded и attention_mask и оборачиваем это в тензоры
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)]) 
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        # no_grad = без обучения
        with torch.no_grad():
            # получаем готовый эмбеддинг по батчу и маске
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        # добавляем полученный эмбеддинг в список, переведя его в массив numpy
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

# Склеиваем полученные значения в один массив numpy
features = np.concatenate(embeddings)

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(features), 
                                                    data['toxic'][:2500],
                                                    test_size=0.2,
                                                    random_state=42)



model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)
print(f1_score(pred, y_test))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 25/25 [1:12:37<00:00, 174.31s/it]


0.7241379310344829
