In [3]:
import pandas as pd
import nltk
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords as nltk_stopwords
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
import re

In [6]:
import torch
import transformers
from tqdm import notebook

In [15]:
#data = data[:24000]
data_max, data = train_test_split(data_all, test_size=0.05)

In [17]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [18]:
def lem(sentence):
    word_list = nltk.word_tokenize(sentence)
    return ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_list])

In [19]:
def clear_text(sentence):
     return " ".join(re.sub(r'[^a-z]', ' ', sentence.lower()).split())

In [23]:
tokenizer = transformers.BertTokenizer.from_pretrained('unitary/toxic-bert')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

In [24]:
tokenized = data['text'].apply(
    lambda x: tokenizer.encode(x, max_length=128, truncation=True, add_special_tokens=True))

In [25]:
tokenized = pd.Series(tokenized)

In [26]:
tokenized

105574    [101, 8840, 2140, 12256, 3995, 8534, 2033, 215...
91474     [101, 2026, 10086, 2015, 1999, 18020, 1006, 25...
59638     [101, 1000, 8089, 1024, 2009, 3544, 2008, 2070...
33521     [101, 2009, 2001, 1999, 1996, 2381, 3931, 2043...
39259     [101, 2339, 1029, 2515, 2053, 2028, 2293, 2017...
                                ...                        
10805     [101, 1000, 2013, 2308, 1005, 1055, 3455, 2000...
84401     [101, 4748, 10020, 3531, 2693, 1045, 9554, 258...
117489    [101, 1000, 1045, 1005, 2310, 15826, 2058, 199...
4695      [101, 1045, 2228, 1045, 10308, 2023, 3392, 232...
105540    [101, 1000, 1024, 1024, 1024, 3374, 2085, 6132...
Name: text, Length: 7965, dtype: object

In [27]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

In [28]:
padded = np.array([i + [0]*(max_len - len(i)) for i in tokenized.values])

In [29]:
attention_mask = np.where(padded != 0, 1, 0)

In [33]:
model = transformers.BertModel.from_pretrained(
    'unitary/toxic-bert')

Some weights of the model checkpoint at unitary/toxic-bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
batch_size = 32
embeddings = []
for i in notebook.tqdm(range(padded.shape[0] // batch_size)):
        batch = torch.LongTensor(padded[batch_size*i:batch_size*(i+1)])
        attention_mask_batch = torch.LongTensor(attention_mask[batch_size*i:batch_size*(i+1)])
        
        
        with torch.no_grad():
            batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        
        embeddings.append(batch_embeddings[0][:,0,:].numpy())

  0%|          | 0/248 [00:00<?, ?it/s]

In [35]:
features = np.concatenate(embeddings)
features = pd.DataFrame(features)

In [36]:
batch = torch.LongTensor(padded[len(features):])
attention_mask_batch = torch.LongTensor(attention_mask[len(features):])
        
        
with torch.no_grad():
    batch_embeddings = model(batch, attention_mask=attention_mask_batch)
        
        
embeddings.append(batch_embeddings[0][:,0,:].numpy())

In [38]:
features = np.concatenate(embeddings)
features = pd.DataFrame(features)

In [39]:
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.562459,-0.775878,0.785395,-0.645714,0.819423,0.393917,0.455884,0.295968,-0.268797,-0.316008,...,-0.559729,-0.496388,0.482889,-0.577940,-0.397803,0.490136,-0.504639,-0.470022,0.580904,0.606350
1,-0.698587,-0.990393,0.414407,-0.470985,0.944293,0.252206,-0.179972,-0.012613,-0.368699,-0.555113,...,-0.591524,-1.303957,0.229112,-0.646100,0.158193,0.812776,-0.536130,-0.878739,0.385206,0.077280
2,-0.687120,-1.016031,0.429747,-0.464243,1.084493,0.292625,-0.201261,-0.072167,-0.572984,-0.658635,...,-0.476845,-1.317816,0.221509,-0.830498,0.137587,0.928455,-0.510003,-0.635855,0.289075,-0.119535
3,-0.561410,-1.086236,0.671387,-0.528845,0.978759,0.403805,-0.078692,-0.090955,-0.503623,-0.530652,...,-0.709201,-1.231190,0.211226,-0.665865,0.134830,0.746849,-0.402038,-0.696974,0.526012,0.059630
4,-0.507328,-0.607363,0.449218,-0.549763,0.941412,0.352281,0.731522,0.312153,-0.316915,-0.378545,...,-0.599202,-0.785281,0.553970,-0.616136,0.042383,0.634816,-0.252078,-0.630700,0.618019,0.463434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7960,-0.734001,-0.971238,0.347850,-0.538306,1.042069,0.507167,-0.153121,0.066395,-0.584705,-0.645466,...,-0.709683,-1.151038,0.143474,-0.505673,0.172672,0.490276,-0.540128,-0.647975,0.732148,-0.103960
7961,-0.525466,-0.884406,0.656054,-0.485107,0.990944,0.207554,-0.112538,0.076889,-0.405421,-0.609634,...,-0.574846,-1.270946,0.230102,-0.682882,0.157378,0.893004,-0.467775,-0.815766,0.428942,0.171502
7962,-0.543005,-0.920595,0.489758,-0.424582,0.985051,0.198469,-0.172937,0.068317,-0.396422,-0.667548,...,-0.455504,-1.232768,0.228959,-0.753407,0.145166,0.876641,-0.560120,-0.719095,0.410694,0.076620
7963,-0.574627,-1.037518,0.392460,-0.635311,0.991322,0.404124,-0.159394,-0.033333,-0.377858,-0.496677,...,-0.689997,-1.166807,0.148243,-0.637227,0.211839,0.844834,-0.520845,-0.664164,0.428771,0.038504


In [40]:
features_train, features_test, target_train, target_test = \
train_test_split(features, data['toxic'], \
                 test_size=0.5)

**Обучение**

**Логистическая регрессия**

In [54]:
model_lr = LogisticRegression()
model_lr.fit(features_train, target_train)
prediction = model_lr.predict(features_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
result = f1_score(target_test, prediction)

In [56]:
result

0.9170854271356784

**Вывод:**
\
\
Была поставлена задача изучить данные о позитивности/негативности комментариев клиентов в проекте магазина "Викишоп", обучить модель на этих данных. Изначально данные были представлены в виде текстов с большим числом лишних символов. Их удалось привести к надлежащему виду. Но результата высокого получить не удалось в связи с трудностями в подборе нужных параметров. Была предпринята попытка обучить модель на предобученной модели BERT toxic-bert. Для экономии памяти и времени исходный датасет был значительно урезан. Параметр max_length был взят равным 128. Метрика f1 на тестовой выборке получилась равной 0.9170854271356784. 