# Начальная инициализация

In [1]:
import datetime
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

10-March-2023 07:56:07


In [1]:
import lightning
import gensim
import pickle
import numpy as np
import pandas as pd
import nltk
import pymorphy3
import re
import os
from tqdm import tqdm
import random
import torch
import sklearn.model_selection
import torchmetrics
import IPython.display

# scripts
from scripts.rus_preprocessing_udpipe import Preprocessing

# Download nltk data if it is not present
# nltk.download('punkt)
# nltk.download('stopwords')

random_state = 42
random.seed(random_state)
np.random.seed(random_state)

In [2]:
def word_averaging(model, words):
    all_words, mean = set(), []

    for word in words:
        if hasattr(model, "key_to_index"):
            if word in model.key_to_index:
                mean.append(model[model.key_to_index[word]])
                all_words.add(word)
        else:
            if word in model.wv:
                mean.append(model.wv[word])
                all_words.add(word)

    if not mean:
        return np.zeros(model.vector_size)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)

    return mean


def word_averaging_list(model, text_list):
    return np.vstack([word_averaging(model, comment_text) for comment_text in tqdm(text_list)])

In [12]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

10-March-2023 07:56:12


# Предобработка текста

In [3]:
DATAPATH = 'data/'
train = pd.read_csv(DATAPATH + 'X_y_train.csv', sep=';')
test = pd.read_csv(DATAPATH + 'X_y_test.csv', sep=';')

X_train = [el[0] for el in train[['Text']].values]
X_test = [el[0] for el in test[['Text']].values]
y_train = [el[0] for el in train[['Class']].replace(-1, 0).values]
y_test = [el[0] for el in test[['Class']].replace(-1, 0).values]

In [4]:
def tokenize(text, morph):
  text = re.sub('[^А-Яа-я\s]', '', text).strip()
  tokens = nltk.word_tokenize(text.lower())
  tokens = [morph.parse(el)[0].normal_form for el in tokens]
  tokens = [el for el in tokens if el not in nltk.corpus.stopwords.words('russian')]
  return tokens

morph = pymorphy3.MorphAnalyzer()
X_train_token = [tokenize(t, morph) for t in tqdm(X_train)]
X_test_token = [tokenize(t, morph) for t in tqdm(X_test)]

100%|██████████| 22348/22348 [03:01<00:00, 122.89it/s]
100%|██████████| 50/50 [00:00<00:00, 123.83it/s]


# Word2vec

## Self-trained

In [5]:
w2v_model = gensim.models.Word2Vec(sentences=X_train_token, vector_size=300, window=5, min_count=1, workers=os.cpu_count(), seed=random_state)
w2v_model.build_vocab(X_train_token)
w2v_model.train(X_train_token, total_examples=w2v_model.corpus_count, epochs=300, report_delay=1)

# save trained model
# !mkdir self-trained_word2vec
# w2v_model.save('self-trained_word2vec/word2vec.model')

(45340485, 47914800)

In [6]:
X_train_w2v_self_trained = word_averaging_list(w2v_model, X_train_token)
X_test_w2v_self_trained = word_averaging_list(w2v_model, X_test_token)

100%|██████████| 22348/22348 [00:01<00:00, 12751.99it/s]
100%|██████████| 50/50 [00:00<00:00, 10027.98it/s]


In [24]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

10-March-2023 07:57:24


## Pre-trained

Udpipe разметка

In [7]:
X_train_token_pre_trained = []
preprocessing = Preprocessing(udpipe_filename='models/udpipe_syntagrus.model')
for input_line in tqdm(X_train_token):
    input_line = ' '.join(input_line) # need to concat tokens with preprocessing
    res = preprocessing.unify_sym(input_line.strip())
    output = preprocessing.process(text=res)
    X_train_token_pre_trained.append(output)


X_test_token_pre_trained = []
for input_line in tqdm(X_test_token):
    input_line = ' '.join(input_line) # need to concat tokens with preprocessing
    res = preprocessing.unify_sym(input_line.strip())
    output = preprocessing.process(text=res)
    X_test_token_pre_trained.append(output)

100%|██████████| 22348/22348 [02:45<00:00, 134.65it/s]
100%|██████████| 50/50 [00:00<00:00, 136.70it/s]


In [8]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('models/word2vec/model.bin', encoding='utf-8', unicode_errors='ignore', binary=True)
w2v_model.fill_norms(force=True)

X_train_w2v_pre_trained = word_averaging_list(w2v_model, X_train_token_pre_trained)
X_test_w2v_pre_trained = word_averaging_list(w2v_model, X_test_token_pre_trained)

100%|██████████| 22348/22348 [00:01<00:00, 13651.16it/s]
100%|██████████| 50/50 [00:00<00:00, 12632.68it/s]


# FastText

## Self-trained

In [9]:
ft_model = gensim.models.FastText(sentences=X_train_token, vector_size=300, window=5, min_count=1, workers=os.cpu_count(), seed=random_state)
ft_model.build_vocab(X_train_token)
ft_model.train(X_train_token, total_examples=ft_model.corpus_count, epochs=300, report_delay=1)

# save trained model
# !mkdir self-trained_fasttext
# ft_model.save('self-trained_fasttext/fasttext.model')

(45341567, 47914800)

In [10]:
X_train_ft_self_trained = word_averaging_list(ft_model, X_train_token)
X_test_ft_self_trained = word_averaging_list(ft_model, X_test_token)

100%|██████████| 22348/22348 [00:01<00:00, 13206.11it/s]
100%|██████████| 50/50 [00:00<00:00, 5550.52it/s]


In [32]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

10-March-2023 09:13:34


## Pre-trained

In [11]:
ft_model = gensim.models.KeyedVectors.load('models/fasttext/model.model')
ft_model.fill_norms(force=True)

X_train_ft_pre_trained = word_averaging_list(ft_model, X_train_token)
X_test_ft_pre_trained = word_averaging_list(ft_model, X_test_token)

100%|██████████| 22348/22348 [00:04<00:00, 4529.82it/s]
100%|██████████| 50/50 [00:00<00:00, 2944.28it/s]


In [None]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

10-March-2023 09:13:34


# Сохранение полученных признаков в pickle

In [12]:
def save_pickle(obj, filename: str) -> None:
    pickle.dump(obj, file=open('dumps/' + filename, mode="wb"))
    return

# FastText
save_pickle(X_train_ft_pre_trained, 'X_train_ft_pre_trained' + '.pkl')
save_pickle(X_test_ft_pre_trained, 'X_test_ft_pre_trained' + '.pkl')
save_pickle(X_train_ft_self_trained, 'X_train_ft_self_trained' + '.pkl')
save_pickle(X_test_ft_self_trained, 'X_test_ft_self_trained' + '.pkl')

# Word2Vec
save_pickle(X_train_w2v_pre_trained, 'X_train_w2v_pre_trained' + '.pkl')
save_pickle(X_test_w2v_pre_trained, 'X_test_w2v_pre_trained' + '.pkl')
save_pickle(X_train_w2v_self_trained, 'X_train_w2v_self_trained' + '.pkl')
save_pickle(X_test_w2v_self_trained, 'X_test_w2v_self_trained' + '.pkl')

# Обучение нейронных сетей

Воспроизводимость результатов

In [13]:
lightning.seed_everything(random_state, workers=True)

Global seed set to 42


42

Сетка перебираемых гиперпараметров

In [14]:
param_grid = {
    'embeddings': ['w2v_pretrained', 'ft_pretrained', 'w2v_selftrained', 'ft_selftrained'],
    'layers_count': [2, 3, 4],
    'learning_rate': [0.01, 0.001],
    'layers_type': ['different', 'equal'],
    'batch_size': [8, 64],
    'optimizer': [torch.optim.SGD, torch.optim.AdamW],
    'epochs': [5, 10],
    'activation_function': [torch.nn.functional.relu],
}

params_list = sklearn.model_selection.ParameterGrid(param_grid)
len(params_list)

384

Класс для работы с данными

In [15]:
class NetData(lightning.LightningDataModule):
    def __init__(self, train_features=None, test_features=None, train_targets=None, test_targets=None, batch_size=None, random_state=None):
        super().__init__()
        
        self.batch_size = batch_size
        self.X_train, self.X_val, self.y_train, self.y_val = sklearn.model_selection.train_test_split(train_features, train_targets, random_state=random_state)
        self.X_test, self.y_test = test_features, test_targets

    def setup(self, stage=None):
        features_train = torch.tensor(self.X_train, dtype=torch.float32)
        targets_train = torch.tensor(self.y_train, dtype=torch.int32)
    
        features_val = torch.tensor(self.X_val, dtype=torch.float32)
        targets_val = torch.tensor(self.y_val, dtype=torch.int32)
    
        features_test = torch.tensor(self.X_test, dtype=torch.float32)
        targets_test = torch.tensor(self.y_test, dtype=torch.int32)
    
        self.trainset = torch.utils.data.TensorDataset(features_train, targets_train)
        self.valset = torch.utils.data.TensorDataset(features_val, targets_val)
        self.testset = torch.utils.data.TensorDataset(features_test, targets_test)
        
    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.trainset, batch_size=self.batch_size)
    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.valset, batch_size=self.batch_size)
    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.testset, batch_size=self.batch_size)

Нейронная сеть

In [16]:
#Архитектура нейронной сети
class Net(lightning.LightningModule):
    def __init__(self,
                 learning_rate=None,
                 layers_count=None,
                 layers_type=None,
                 optimizer=None,
                 activation=None
                 ):
        
        super().__init__()  #вх. #вых.
        self.layers = torch.nn.ModuleList()
        input_dim = 2**(2 + layers_count)
        self.layers.append(torch.nn.Linear(300, input_dim))
        for _ in range(layers_count):
            if layers_type == 'equal':
                self.layers.append( torch.nn.Linear(input_dim, input_dim) )
            elif layers_type == "different":
                self.layers.append( torch.nn.Linear(input_dim, input_dim // 2) )
                input_dim //= 2
        self.layers.append(torch.nn.Linear(input_dim, 2))

        self.activation = activation

        self.learning_rate = learning_rate
        self.optimizer = optimizer
        self.f1 = torchmetrics.classification.F1Score(task='multiclass', num_classes=2, multidim_average='global', average='weighted')

    def forward(self, x):
        for layer in self.layers[:-1]:
            x = self.activation(layer(x))
        x = self.layers[-1](x)
        return torch.nn.functional.log_softmax(x, dim=-1)
    
    def configure_optimizers(self):
        optimizer = self.optimizer(self.parameters(), lr=self.learning_rate)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y.long())
        self.log('loss', loss, on_epoch=True)
        self.log("train_f1", self.f1(y_hat, y), on_epoch=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        pred = self(x)
        self.log("val_f1", self.f1(pred, y), on_epoch=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        pred = self(x)
        self.log("test_f1", self.f1(pred, y), on_epoch=True)

In [17]:
total = []

for params in tqdm(params_list): 

    embeddings = params['embeddings']
    learningRate = params['learning_rate']
    optimizer_type = params['optimizer']
    layers_count = params['layers_count']
    batch_size = params['batch_size']
    epochs = params['epochs']
    activation_function = params['activation_function']
    layers_type = params['layers_type']


    net = Net(learning_rate=learningRate,
          layers_count=layers_count,
          layers_type=layers_type,
          optimizer=optimizer_type,
          activation=activation_function
        )
    
    dm = None
    if embeddings == 'w2v_pretrained':
        dm = NetData(
            train_features=X_train_w2v_pre_trained,
            test_features=X_test_w2v_pre_trained,
            train_targets=y_train,
            test_targets=y_test,
            batch_size=batch_size,
            random_state=random_state
        )
    elif embeddings == 'w2v_selftrained':
        dm = NetData(
            train_features=X_train_w2v_self_trained,
            test_features=X_test_w2v_self_trained,
            train_targets=y_train,
            test_targets=y_test,
            batch_size=batch_size,
            random_state=random_state
        )
    elif embeddings == 'ft_pretrained':
        dm = NetData(
            train_features=X_train_ft_pre_trained,
            test_features=X_test_ft_pre_trained,
            train_targets=y_train,
            test_targets=y_test,
            batch_size=batch_size,
            random_state=random_state
        )
    elif embeddings == 'ft_selftrained':
        dm = NetData(
            train_features=X_train_ft_self_trained,
            test_features=X_test_ft_self_trained,
            train_targets=y_train,
            test_targets=y_test,
            batch_size=batch_size,
            random_state=random_state
        )
    

    # CPU is better for FCNN than GPU
    trainer = lightning.Trainer(logger=False, max_epochs=epochs, enable_progress_bar=True, deterministic=True, inference_mode=True, enable_checkpointing=False, accelerator='cpu')

    trainer.fit(net, datamodule=dm)
    f1_train = trainer.logged_metrics['train_f1_epoch'].item()
    f1_val = trainer.logged_metrics['val_f1'].item()
    f1_test = trainer.test(net, datamodule=dm)[-1]['test_f1']    
    
    
    total.append({
        'embeddings': embeddings,
        'Layers type': layers_type,
        'Optimizer': optimizer_type.__name__,
        'Batch': batch_size,
        'Count of layers': layers_count,
        'Epochs': epochs,
        'Learning rate': learningRate,
        'Activation function': activation_function.__name__,
        'F1-train': round(f1_train, 4),
        'F1-val': round(f1_val, 4),
        'F1-test': round(f1_test, 4)
    })
    
    IPython.display.clear_output(wait=True) 

100%|██████████| 384/384 [12:44:42<00:00, 119.49s/it]


# Сводная таблица

In [18]:
pd.set_option('display.max_rows', None)
summary = pd.DataFrame.from_dict(total)
summary_sort = summary.sort_values(by='F1-val', ascending=False)
summary_sort

Unnamed: 0,embeddings,Layers type,Optimizer,Batch,Count of layers,Epochs,Learning rate,Activation function,F1-train,F1-val,F1-test
367,ft_selftrained,equal,AdamW,64,2,10,0.001,relu,0.7002,0.6935,0.64
171,ft_selftrained,different,AdamW,8,2,10,0.001,relu,0.7255,0.6912,0.5671
351,ft_selftrained,equal,AdamW,64,3,5,0.001,relu,0.6943,0.6876,0.6
175,ft_selftrained,equal,AdamW,8,2,10,0.001,relu,0.7269,0.6869,0.6303
151,ft_selftrained,equal,AdamW,8,2,5,0.001,relu,0.6934,0.6851,0.6395
355,ft_selftrained,different,AdamW,64,4,5,0.001,relu,0.6871,0.6845,0.6
155,ft_selftrained,different,AdamW,8,3,5,0.001,relu,0.7011,0.684,0.5972
347,ft_selftrained,different,AdamW,64,3,5,0.001,relu,0.687,0.684,0.5805
339,ft_selftrained,different,AdamW,64,2,5,0.001,relu,0.6898,0.684,0.6006
363,ft_selftrained,different,AdamW,64,2,10,0.001,relu,0.6946,0.6834,0.6604


# Выводы

Эксперименты показали высокую точность нейросетей. Лучший результат был достигнут на самостоятельно обученной FastText модели на 4 слойной сети с одинаковыми слоями по 64 нейрона в каждом с показателем f1-weighted = 0.84. Обучение выполнялось с batch_size = 1, SDG и 10 эпохах. В целом, модели Word2Vec показали себя более устойчивыми к обучаемости и в среднем результат показан получше, чем у FastText.