In [6]:
import functools
import math
import os
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import transformers as ppb # pytorch transformers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Checkpoint name shared by the tokenizer and the model.
pretrained_weights = 'bert-base-uncased'
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer

# Load the pretrained tokenizer and model for that checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Make sure the cache directory exists before anything is read from or written to it.
os.makedirs('datasets', exist_ok=True)

# Always load the raw comments: `df` is needed later for the target column,
# so it must be defined even when the tokenized array comes from the cache.
local_data = 'datasets/toxic_comments.csv'
try:
    df = pd.read_csv(local_data)
    print('dataset loaded from cache')
except IOError:
    print('downloding dataset...')
    df = pd.read_csv('https://code.s3.yandex.net/datasets/toxic_comments.csv')
    # Drop the first column (presumably a saved row index - TODO confirm).
    df = df.drop(df.columns[0], axis=1)
    df.to_csv(local_data, index=False)
    print('dataset downloded and saved to cache')

# Try to reuse the preprocessed token-id array from the cache.
try:
    padded = np.load('datasets/padded.npy')
except IOError:
    padded = None

if padded is not None and padded.shape[0] == len(df):
    print('Padded array loaded from cache.')
else:
    # No cache, or the cached array has a different number of rows than the
    # dataset: tokenize from scratch.
    print('Making tikenization...')
    # Each comment is cut to its first 512 characters, and the resulting token
    # sequence (special tokens included) is additionally capped at 512 tokens so
    # it can never exceed the position limit of bert-base-uncased.
    tokenized = df['text'].apply(
        lambda x: tokenizer.encode(
            x[:512], add_special_tokens=True, truncation=True, max_length=512))

    # Length of the longest token sequence.
    max_len = tokenized.map(len).max()

    # Right-pad every sequence with zeros to the same length. The dtype is fixed
    # to int64 so the array converts to a LongTensor on every platform.
    padded = np.array(
        [i + [0] * (max_len - len(i)) for i in tokenized.values], dtype=np.int64)

    # Save the array to the cache.
    np.save('datasets/padded.npy', padded)
    print('padded array saved to cache.')

# 1 for real tokens, 0 for the zero padding added above.
attention_mask = np.where(padded != 0, 1, 0)

downloding dataset...
dataset downloded and saved to cache
Making tikenization...
padded array saved to cache.


In [4]:
def timer(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapped function's return value is passed through unchanged. The
    duration is printed only when the call returns normally; if ``func``
    raises, the exception propagates and nothing is printed.

    Args:
        func (callable): function to be timed.

    Returns:
        callable: wrapper with the same name, docstring and call signature
        behaviour as ``func``.
    """
    # functools.wraps keeps func.__name__ / __doc__ on the wrapper, so help()
    # and tracebacks still refer to the decorated function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start = datetime.now()
        result = func(*args, **kwargs)
        total_time = (datetime.now() - start).total_seconds()
        print(f'Total timelimit: {total_time}')
        return result
    return wrapper

@timer
def make_features(model, padded, attention_mask, batch_size: int, use_cache: bool = True,
                  mmap_mode: bool = False, cache_path: str = 'datasets/features.npy') -> np.ndarray:
    """Compute one embedding vector per row of ``padded`` with ``model``.

    For every batch the model is called as ``model(batch, attention_mask=...)``;
    from the first element of its output the vector at sequence position 0 is
    kept (the [CLS] position when the rows were encoded with special tokens).

    Args:
        model (torch model): pretrained torch model. It is switched to eval
            mode and, when CUDA is available, moved to the first CUDA device.
        padded (ndarray): 2-D array of padded token ids, one row per sample.
        attention_mask (ndarray): attention mask with the same shape as ``padded``.
        batch_size (int): number of rows sent through the model at once.
        use_cache (bool, optional): load the features from ``cache_path`` if
            possible and save them there after computing. Defaults to True.
        mmap_mode (bool, optional): when loading from the cache, memory-map the
            file read-only instead of reading it into memory. Defaults to False.
        cache_path (str, optional): location of the cache file; should end with
            ``.npy`` because ``np.save`` appends that extension otherwise.
            Defaults to 'datasets/features.npy'.

    Returns:
        np.ndarray: features array with one row per row of ``padded`` (a
        read-only memmap when loaded from the cache with ``mmap_mode=True``).

    Raises:
        ValueError: if the features have to be computed and ``batch_size`` is
            not positive or ``padded`` has no rows.
    """
    if use_cache:
        try:
            # try to load embeddings from ndarray file
            features = np.load(cache_path, mmap_mode='r' if mmap_mode else None)
        except IOError:
            # no usable cache file: fall through and compute the features
            features = None

        if features is not None:
            if features.shape[0] == padded.shape[0]:
                print('Features loaded from cache.')
                return features
            # The cache was built for a different input; using it would pair
            # embeddings with the wrong rows, so it is recomputed instead.
            print('Cached features do not match the input size, recomputing.')
            del features  # release a possible memmap before the file is rewritten

    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')
    if padded.shape[0] == 0:
        raise ValueError('padded is empty: there is nothing to compute features for')

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Select first CUDA device
        device = torch.device("cuda:0")
        print("Используется устройство:", torch.cuda.get_device_name(device))

        # move model to CUDA device
        model = model.to(device)

    # Inference only: make sure dropout is disabled so the features are deterministic.
    model.eval()

    n_batches = math.ceil(padded.shape[0] / batch_size)
    embeddings = []
    batch_times = []  # per-batch durations, used for the remaining-time estimate
    for i in range(n_batches):
        batch_start = datetime.now()
        rows = slice(batch_size * i, batch_size * (i + 1))
        batch = torch.tensor(padded[rows], dtype=torch.long)
        attention_mask_batch = torch.tensor(attention_mask[rows], dtype=torch.long)

        if cuda_available:
            # move tensors to CUDA device
            batch = batch.to(device)
            attention_mask_batch = attention_mask_batch.to(device)

        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)

        # Keep the vector at sequence position 0 for every row;
        # .cpu() is required when the tensors live on a CUDA device.
        embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())

        batch_times.append((datetime.now() - batch_start).total_seconds())

        # print progress every 100 batches
        if i % 100 == 0:
            mean_iteration_time = np.mean(batch_times)
            # batch i has just finished, so n_batches - (i + 1) batches remain
            timeleft = int((n_batches - (i + 1)) * mean_iteration_time)
            print(f'{i}/{n_batches} timeleft {timeleft} sec.')

    features = np.concatenate(embeddings)

    if use_cache:
        # Create the cache directory first so a missing folder cannot discard
        # the result of a long computation.
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        np.save(cache_path, features)
        print('features array saved to cache.')

    return features
    

In [5]:
# Embed every comment, sending 128 rows through the model per batch.
features = make_features(
    model=model,
    padded=padded,
    attention_mask=attention_mask,
    batch_size=128,
)

Используется устройство: NVIDIA GeForce RTX 3060 Laptop GPU
0/1245 timeleft 5336 sec.
100/1245 timeleft 3939 sec.
200/1245 timeleft 3598 sec.
300/1245 timeleft 3259 sec.
400/1245 timeleft 2916 sec.
500/1245 timeleft 2572 sec.
600/1245 timeleft 2229 sec.
700/1245 timeleft 1884 sec.
800/1245 timeleft 1539 sec.
900/1245 timeleft 1193 sec.
1000/1245 timeleft 847 sec.
1100/1245 timeleft 501 sec.
1200/1245 timeleft 155 sec.
features array saved to cache.
Total timelimit: 4311.726253


In [9]:
# Hold out 25% of the rows for evaluation. The split is seeded so the reported
# score is reproducible, and stratified on the target so the train and test
# parts keep the same share of toxic comments.
x_train, x_test, y_train, y_test = train_test_split(
    features, df['toxic'], test_size=0.25, random_state=42, stratify=df['toxic'])

# Train a logistic regression on the embeddings and predict the held-out rows.
lr_model = LogisticRegression(max_iter=2000)
lr_model.fit(x_train, y_train)
pred = lr_model.predict(x_test)

In [10]:
# F1 score of the held-out predictions against the true labels.
f1_score(y_true=y_test, y_pred=pred)

0.7298747763864043