# Начальная инициализация

In [1]:
import datetime
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 10:39:24


In [2]:
#импорт библиотек
# !pip install torch torchvision torchaudio
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
# !pip install scikit-learn
from sklearn.metrics import classification_report, f1_score
import torch.optim as optim
# !pip install pandas
import pandas as pd
import numpy as np

# import re
# !pip install nltk
# import nltk
# nltk.download('punkt')
# from string import punctuation

# %pip install gensim
import gensim
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

from gensim.models import FastText

EPOCHS = 10

In [3]:
# Подключение вычислений на видеокарту, если доступна
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)
# device = "cpu"

In [4]:
print(device)

cuda:0


# Модели и методы для обучения и тестирования

In [5]:
class Net_3_layer(nn.Module):
    def __init__(self, layers = None, func_activation = None, weight_init = None, normalization = False, dropout = False):
        super().__init__()  #вх. #вых.
        self.fc1 = nn.Linear(300, layers[0])
        self.fc2 = nn.Linear(layers[0], layers[1])
        self.fc3 = nn.Linear(layers[1], layers[2])
        self.fc4 = nn.Linear(layers[2], layers[3])
        self.fc5 = nn.Linear(layers[3], 2)

        if weight_init == 'kaiming':
            torch.nn.init.kaiming_uniform_(self.fc1.weight)
        elif weight_init == 'xavier':
            torch.nn.init.xavier_uniform_(self.fc1.weight)
        else:
            pass
            #raise Exception('This initialization is not present yet')
            
        
        self.activate = func_activation
        
        if normalization == True:
            self.norm = nn.LayerNorm(layers[0])
        
        if dropout == True:
            self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        if normalization == True:
            x = self.activate(self.norm(self.fc1(x)))
        else:
            x = self.activate(self.fc1(x))
        x = self.activate(self.fc2(x))
        x = self.activate(self.fc3(x))
        x = self.activate(self.fc4(x))
        x = self.fc5(x)
        return F.log_softmax(x, dim=1)

In [6]:
def train_net(
            net = None,
            x = None, 
            y = None, 
            batch = None, 
            epochs = None, 
            device = None, 
            scheduler = None,
            optimizer = None
    ):
    net = net.to(device)

    optimizer = optimizer
    
    scheduler = scheduler
    
    inputs_train = torch.tensor(x).to(device)
    targets_train = torch.tensor(y).int().to(device)

    train = data_utils.TensorDataset(inputs_train, targets_train)

    trainset = torch.utils.data.DataLoader(train, shuffle=True, batch_size=batch)

    print('-'*30)

    for epoch in range(epochs):
        loss = None
        for data in trainset:
            X, Y = data[0].to(device), data[1].to(device)
            net.zero_grad()
            output = net(X.float())
            loss = F.cross_entropy(output, Y.long())
            loss.backward()
            optimizer.step()
            scheduler.step()
        print(loss)
    
    
    return net

In [7]:
def test_net(net = None, device = None, x = None, y = None, batch = None):
    inputs_test = torch.tensor(x).to(device)
    targets_test = torch.tensor(y).int().to(device)
    
    test = data_utils.TensorDataset(inputs_test, targets_test)

    testset = torch.utils.data.DataLoader(test, shuffle=False, batch_size=batch)

    ams = []
    with torch.no_grad():
        for data in testset:
            X, Y = data[0].to(device), data[1].to(device)

            output = net(X.float())
            for idx, i in enumerate(output):
                ams.append(torch.argmax(i).item())
    return f1_score(y, ams, average='weighted')

In [8]:
def word_averaging(model, words):
    all_words, mean = set(), []

    for word in words:
        mean.append(model.wv[word])
        all_words.add(word)

    if not mean:
        return np.zeros(model.vector_size)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)

    return mean


def word_averaging_list(model, text_list):
    return np.vstack([word_averaging(model, comment_text) for comment_text in text_list])

In [9]:
def word_averaging_pre_trained(model, words):
  all_words, mean = set(), []

  for word in words:
    if word in model.key_to_index:
      mean.append(model[model.key_to_index[word]])
      all_words.add(word)

  if not mean:
    return np.zeros(model.vector_size)

  mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)

  return mean


def word_averaging_list_pre_trained(model, text_list):
  return np.vstack([word_averaging_pre_trained(model, comment_text) for comment_text in text_list])

In [10]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 10:39:28


# Предобработка текста

In [11]:
train = pd.read_csv('X_y_train.csv', sep=';')
test = pd.read_csv('X_y_test.csv', sep=';')

X_train = [el[0] for el in train[['Text']].values]
X_test = [el[0] for el in test[['Text']].values]
y_train = [el[0] for el in train[['Class']].replace(-1, 0).values]
y_test = [el[0] for el in test[['Class']].replace(-1, 0).values]

In [12]:
# Create a X_token file one time!

# def tokenize(text):

#   text_token = nltk.word_tokenize(text)
#   text_word = [el.lower() for el in text_token if el not in punctuation]
#   return text_word

# X_train_token = [tokenize(t) for t in X_train]
# X_test_token = [tokenize(t) for t in X_test]



In [14]:
#Create a X_token file one time!

# def tokenize(text):

#   text_token = nltk.word_tokenize(text)
#   text_word = [el.lower() for el in text_token if el not in punctuation]
#   return text_word

# X_train_token = [tokenize(t) for t in X_train]
# X_test_token = [tokenize(t) for t in X_test]

In [15]:
# with open('X_train_token.txt', mode='w+') as file:
#     for sentence in X_train_token:
#         print(*sentence, file=file)

In [16]:
X_token = [sentence.split() for sentence in open('X_token.txt', mode='r')]
X_train_token = [sentence.split() for sentence in open('X_train_token.txt', mode='r')]
X_test_token = [sentence.split() for sentence in open('X_test_token.txt', mode='r')]

Функции для работы предобученных моделей RusVectores

In [17]:
import sys
import os
# !pip install wget
import wget
import re
# !pip install ufal.udpipe
from ufal.udpipe import Model, Pipeline

"""
Этот скрипт принимает на вход необработанный русский текст 
(одно предложение на строку или один абзац на строку).
Он токенизируется, лемматизируется и размечается по частям речи с использованием UDPipe.
На выход подаётся последовательность разделенных пробелами лемм с частями речи 
("зеленый_ADJ трамвай_NOUN").
Их можно непосредственно использовать в моделях с RusVectōrēs (https://rusvectores.org).

Примеры запуска:
echo 'Мама мыла раму.' | python3 rus_preprocessing_udpipe.py
zcat large_corpus.txt.gz | python3 rus_preprocessing_udpipe.py | gzip > processed_corpus.txt.gz
"""


def num_replace(word):
    newtoken = "x" * len(word)
    return newtoken


def clean_token(token, misc):
    """
    :param token:  токен (строка)
    :param misc:  содержимое поля "MISC" в CONLLU (строка)
    :return: очищенный токен (строка)
    """
    out_token = token.strip().replace(" ", "")
    if token == "Файл" and "SpaceAfter=No" in misc:
        return None
    return out_token


def clean_lemma(lemma, pos):
    """
    :param lemma: лемма (строка)
    :param pos: часть речи (строка)
    :return: очищенная лемма (строка)
    """
    out_lemma = lemma.strip().replace(" ", "").replace("_", "").lower()
    if "|" in out_lemma or out_lemma.endswith(".jpg") or out_lemma.endswith(".png"):
        return None
    if pos != "PUNCT":
        if out_lemma.startswith("«") or out_lemma.startswith("»"):
            out_lemma = "".join(out_lemma[1:])
        if out_lemma.endswith("«") or out_lemma.endswith("»"):
            out_lemma = "".join(out_lemma[:-1])
        if (
            out_lemma.endswith("!")
            or out_lemma.endswith("?")
            or out_lemma.endswith(",")
            or out_lemma.endswith(".")
        ):
            out_lemma = "".join(out_lemma[:-1])
    return out_lemma


def list_replace(search, replacement, text):
    search = [el for el in search if el in text]
    for c in search:
        text = text.replace(c, replacement)
    return text


def unify_sym(text):  # принимает строку в юникоде
    text = list_replace(
        "\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019",
        "\u0022",
        text,
    )

    text = list_replace(
        "\u2012\u2013\u2014\u2015\u203E\u0305\u00AF", "\u2003\u002D\u002D\u2003", text
    )

    text = list_replace("\u2010\u2011", "\u002D", text)

    text = list_replace(
        "\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000",
        "\u2002",
        text,
    )

    text = re.sub("\u2003\u2003", "\u2003", text)
    text = re.sub("\t\t", "\t", text)

    text = list_replace(
        "\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062",
        ".",
        text,
    )

    text = list_replace("\u2217", "\u002A", text)

    text = list_replace("…", "...", text)

    text = list_replace("\u2241\u224B\u2E2F\u0483", "\u223D", text)

    text = list_replace("\u00C4", "A", text)  # латинская
    text = list_replace("\u00E4", "a", text)
    text = list_replace("\u00CB", "E", text)
    text = list_replace("\u00EB", "e", text)
    text = list_replace("\u1E26", "H", text)
    text = list_replace("\u1E27", "h", text)
    text = list_replace("\u00CF", "I", text)
    text = list_replace("\u00EF", "i", text)
    text = list_replace("\u00D6", "O", text)
    text = list_replace("\u00F6", "o", text)
    text = list_replace("\u00DC", "U", text)
    text = list_replace("\u00FC", "u", text)
    text = list_replace("\u0178", "Y", text)
    text = list_replace("\u00FF", "y", text)
    text = list_replace("\u00DF", "s", text)
    text = list_replace("\u1E9E", "S", text)

    currencies = list(
        "\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192"
    )

    alphabet = list(
        '\t\n\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ,.[]{}()=+-−*&^%$#@!?~;:0123456789§/\|"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
    )

    alphabet.append("'")

    allowed = set(currencies + alphabet)

    cleaned_text = [sym for sym in text if sym in allowed]
    cleaned_text = "".join(cleaned_text)

    return cleaned_text


def process(pipeline, text="Строка", keep_pos=True, keep_punct=False):
    # Если частеречные тэги не нужны (например, их нет в модели), выставьте pos=False
    # в этом случае на выход будут поданы только леммы
    # По умолчанию знаки пунктуации вырезаются. Чтобы сохранить их, выставьте punct=True

    entities = {"PROPN"}
    named = False
    memory = []
    mem_case = None
    mem_number = None
    tagged_propn = []

    # обрабатываем текст, получаем результат в формате conllu:
    processed = pipeline.process(text)

    # пропускаем строки со служебной информацией:
    content = [line for line in processed.split("\n") if not line.startswith("#")]

    # извлекаем из обработанного текста леммы, тэги и морфологические характеристики
    tagged = [w.split("\t") for w in content if w]

    for t in tagged:
        if len(t) != 10:
            continue
        (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
        token = clean_token(token, misc)
        lemma = clean_lemma(lemma, pos)
        if not lemma or not token:
            continue
        if pos in entities:
            if "|" not in feats:
                tagged_propn.append("%s_%s" % (lemma, pos))
                continue
            morph = {el.split("=")[0]: el.split("=")[1] for el in feats.split("|")}
            if "Case" not in morph or "Number" not in morph:
                tagged_propn.append("%s_%s" % (lemma, pos))
                continue
            if not named:
                named = True
                mem_case = morph["Case"]
                mem_number = morph["Number"]
            if morph["Case"] == mem_case and morph["Number"] == mem_number:
                memory.append(lemma)
                if "SpacesAfter=\\n" in misc or "SpacesAfter=\s\\n" in misc:
                    named = False
                    past_lemma = "::".join(memory)
                    memory = []
                    tagged_propn.append(past_lemma + "_PROPN")
            else:
                named = False
                past_lemma = "::".join(memory)
                memory = []
                tagged_propn.append(past_lemma + "_PROPN")
                tagged_propn.append("%s_%s" % (lemma, pos))
        else:
            if not named:
                if (
                    pos == "NUM" and token.isdigit()
                ):  # Заменяем числа на xxxxx той же длины
                    lemma = num_replace(token)
                tagged_propn.append("%s_%s" % (lemma, pos))
            else:
                named = False
                past_lemma = "::".join(memory)
                memory = []
                tagged_propn.append(past_lemma + "_PROPN")
                tagged_propn.append("%s_%s" % (lemma, pos))

    if not keep_punct:
        tagged_propn = [word for word in tagged_propn if word.split("_")[1] != "PUNCT"]
    if not keep_pos:
        tagged_propn = [word.split("_")[0] for word in tagged_propn]
    return tagged_propn


# URL of the UDPipe model
udpipe_model_url = "https://rusvectores.org/static/models/udpipe_syntagrus.model"
udpipe_filename = '' + udpipe_model_url.split("/")[-1]

if not os.path.isfile(udpipe_filename):
    print("UDPipe model not found. Downloading...", file=sys.stderr)
    wget.download(udpipe_model_url)

print("\nLoading the model...", file=sys.stderr)
model = Model.load(udpipe_filename)
process_pipeline = Pipeline(
    model, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
)


Loading the model...


In [18]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 10:39:31


# Word2vec-признаки, обученные самостоятельно

## Обучение модели word2vec на полученном наборе данных

In [19]:
w2v_model = Word2Vec(sentences=X_token, vector_size=300, window=5, min_count=1, workers=4)

In [20]:
w2v_model.build_vocab(X_token)

In [21]:
w2v_model.train(X_token, total_examples=w2v_model.corpus_count, epochs=300, report_delay=1)
# w2v_model.init_sims(replace=True)

(78027793, 92066100)

In [22]:
vector = w2v_model.wv['работа']
# print(vector)
w2v_model.save('self-trained_word2vec/word2vec.model')

In [23]:
word_vectors = w2v_model.wv
X_train = word_averaging_list(w2v_model, X_train_token)
X_test = word_averaging_list(w2v_model, X_test_token)


In [24]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 10:41:16


## Обучение нейронных сетей

In [25]:
f1_score_w2v_self_trained = []

for fa in [F.relu, F.leaky_relu]:
    for optimizer in [optim.SGD, optim.AdamW]:
        for initialization in ['kaiming', 'xavier', 'None']:
            for regularization in ['dropout', 'l2_reg', 'None']:
                for normalization in ['layerNorm', 'None']:
                    for schedulerType in ['ExponentialLR', 'MultiStepLR']:
                        for learningRate in [0.01, 0.001, 0.0001]:
                        
                            if normalization == 'layerNorm':
                                normType = True
                            else:
                                normType = False
                    
                            if regularization == 'dropout':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, dropout = True, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                            elif regularization == 'l2_reg':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate, weight_decay=1e-4)
                            else:
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                        
                            if schedulerType == 'ExponentialLR':
                                scheduler = optim.lr_scheduler.ExponentialLR(optimSet, gamma=0.8)
                            else:
                                scheduler = optim.lr_scheduler.MultiStepLR(optimSet, gamma=0.8, milestones=[1,3,5])
                        
                            net = train_net(net=net, x=X_train, y=y_train, batch=512, epochs=EPOCHS, device=device, optimizer=optimSet, scheduler=scheduler)
        
                            f1_score_w2v_self_trained.append(test_net(net=net, device=device, x=X_test, y=y_test, batch=1))

------------------------------
tensor(0.7289, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7153, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7003, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7225, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7130, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7217, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7249, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7082, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7146, device='cuda:0', grad_fn=<NllLossBackward0>)
------------------------------
tensor(0.7362, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7381, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7648, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7897, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7630, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7523, device='cuda:0', grad_f

In [26]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 10:53:48


# FastText-признаки, обученные самостоятельно

## Обучение модели FastText на полученном наборе данных

In [27]:
ft_model = FastText(sentences=X_token, vector_size=300, window=5, min_count=1, workers=4)

In [28]:
ft_model.build_vocab(X_token)

In [29]:
ft_model.train(X_token, total_examples=ft_model.corpus_count, epochs=300, report_delay=1)
# ft_model.init_sims(replace=True)

(78024388, 92066100)

In [30]:
vector = ft_model.wv['работа']
# print(vector)
ft_model.save('self-trained_fasttext/fasttext.model')

In [31]:
word_vectors = ft_model.wv
X_train = word_averaging_list(ft_model, X_train_token)
X_test = word_averaging_list(ft_model, X_test_token)

In [32]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 10:59:59


## Обучение нейронных сетей

In [33]:
f1_score_ft_self_trained = []

for fa in [F.relu, F.leaky_relu]:
    for optimizer in [optim.SGD, optim.AdamW]:
        for initialization in ['kaiming', 'xavier', 'None']:
            for regularization in ['dropout', 'l2_reg', 'None']:
                for normalization in ['layerNorm', 'None']:
                    for schedulerType in ['ExponentialLR', 'MultiStepLR']:
                        for learningRate in [0.01, 0.001, 0.0001]:
                        
                            if normalization == 'layerNorm':
                                normType = True
                            else:
                                normType = False
                    
                            if regularization == 'dropout':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, dropout = True, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                            elif regularization == 'l2_reg':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate, weight_decay=1e-4)
                            else:
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                        
                            if schedulerType == 'ExponentialLR':
                                scheduler = optim.lr_scheduler.ExponentialLR(optimSet, gamma=0.8)
                            else:
                                scheduler = optim.lr_scheduler.MultiStepLR(optimSet, gamma=0.8, milestones=[1,3,5])
                        
                            net = train_net(net=net, x=X_train, y=y_train, batch=512, epochs=EPOCHS, device=device, optimizer=optimSet, scheduler=scheduler)
        
                            f1_score_ft_self_trained.append(test_net(net=net, device=device, x=X_test, y=y_test, batch=1))

------------------------------
tensor(0.6940, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6949, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6993, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7047, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7025, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6958, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6981, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6964, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6984, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6984, device='cuda:0', grad_fn=<NllLossBackward0>)
------------------------------
tensor(0.6942, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6796, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6870, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.6827, device='cuda:0', grad_f

In [34]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 11:11:14


# Word2Vec-признаки предобученные

## Предобработка текста

In [35]:
train = pd.read_csv('X_y_train.csv', sep=';')
test = pd.read_csv('X_y_test.csv', sep=';')

print("Processing input...", file=sys.stderr)
X_train_token_pre_trained = []
for input_line in train['Text']:
    res = unify_sym(input_line.strip())
    output = process(process_pipeline, text=res)
    X_train_token_pre_trained.append(output)


X_test_token_pre_trained = []
for input_line in test['Text']:
    res = unify_sym(input_line.strip())
    output = process(process_pipeline, text=res)
    X_test_token_pre_trained.append(output)

Processing input...


In [36]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 11:12:51


## Обучение модели Word2Vec на полученном наборе данных

In [37]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('word2vec/model.bin', encoding='utf-8', unicode_errors='ignore', binary=True)
w2v_model.fill_norms(force=True)

In [38]:
vector = w2v_model[w2v_model.key_to_index['работа_NOUN']]
print(vector)
# w2v_model.save('fasttext.model')

[-3.02568102e+00  4.10510159e+00  3.96478510e+00  1.99146974e+00
  2.75750041e-01  1.05907798e+00 -2.34842032e-01  1.13069057e+00
 -3.34521389e+00  5.44947767e+00 -1.78760219e+00  2.93239641e+00
  4.24604845e+00 -3.08241105e+00 -3.06380081e+00  4.14739418e+00
  1.94640350e+00 -6.41723156e+00  4.48100597e-01  1.98949501e-01
 -1.96533740e+00  2.04884505e+00  6.81714356e-01  1.89991868e+00
 -1.87503803e+00  1.61289966e+00  8.80351245e-01  1.27756655e+00
 -1.60905108e-01 -2.56419826e+00  5.59642196e-01 -2.38538122e+00
  5.61529458e-01 -3.78697932e-01 -5.17279339e+00 -4.75222081e-01
 -7.38774776e-01 -1.47579443e+00  5.36987162e+00  1.66592920e+00
  2.48067904e+00 -3.41140532e+00 -3.26146185e-01  1.99449539e+00
  1.23095262e+00  1.58154178e+00  1.74632573e+00  1.36116230e+00
 -2.66357565e+00  6.55228496e-01  2.66324496e+00 -2.19835329e+00
 -7.47147948e-02  2.37323642e+00 -2.46333432e+00  2.09079415e-01
 -2.15091515e+00 -5.49663115e+00 -8.21452737e-01  1.57285839e-01
  6.61701798e-01 -3.56726

In [39]:
# word_vectors = w2v_model[:]
X_train = word_averaging_list_pre_trained(w2v_model, X_train_token_pre_trained)
X_test = word_averaging_list_pre_trained(w2v_model, X_test_token_pre_trained)

In [40]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 11:12:55


## Обучение нейронных сетей

In [41]:
f1_score_w2v_pre_trained = []

for fa in [F.relu, F.leaky_relu]:
    for optimizer in [optim.SGD, optim.AdamW]:
        for initialization in ['kaiming', 'xavier', 'None']:
            for regularization in ['dropout', 'l2_reg', 'None']:
                for normalization in ['layerNorm', 'None']:
                    for schedulerType in ['ExponentialLR', 'MultiStepLR']:
                        for learningRate in [0.01, 0.001, 0.0001]:
                        
                            if normalization == 'layerNorm':
                                normType = True
                            else:
                                normType = False
                    
                            if regularization == 'dropout':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, dropout = True, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                            elif regularization == 'l2_reg':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate, weight_decay=1e-4)
                            else:
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                        
                            if schedulerType == 'ExponentialLR':
                                scheduler = optim.lr_scheduler.ExponentialLR(optimSet, gamma=0.8)
                            else:
                                scheduler = optim.lr_scheduler.MultiStepLR(optimSet, gamma=0.8, milestones=[1,3,5])
                        
                            net = train_net(net=net, x=X_train, y=y_train, batch=512, epochs=EPOCHS, device=device, optimizer=optimSet, scheduler=scheduler)
        
                            f1_score_w2v_pre_trained.append(test_net(net=net, device=device, x=X_test, y=y_test, batch=1))

------------------------------
tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7513, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7307, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7339, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7625, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7275, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7577, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7402, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7815, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7450, device='cuda:0', grad_fn=<NllLossBackward0>)
------------------------------
tensor(0.7562, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7577, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7320, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7638, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7305, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7471, device='cuda:0', grad_f

In [42]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 11:24:23


# FastText-признаки предобученные

## Предобработка текста

In [43]:
train = pd.read_csv('X_y_train.csv', sep=';')
test = pd.read_csv('X_y_test.csv', sep=';')

print("Processing input...", file=sys.stderr)
X_train_token_pre_trained = []
for input_line in train['Text']:
    res = unify_sym(input_line.strip())
    output = process(process_pipeline, text=res, keep_pos=False)
    X_train_token_pre_trained.append(output)


X_test_token_pre_trained = []
for input_line in test['Text']:
    res = unify_sym(input_line.strip())
    output = process(process_pipeline, text=res, keep_pos=False)
    X_test_token_pre_trained.append(output)

Processing input...


In [44]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 11:26:14


## Обучение модели FastText на полученном наборе данных

In [45]:
ft_model = gensim.models.KeyedVectors.load('fasttext/model.model')
ft_model.fill_norms(force=True)

In [46]:
vector = ft_model[ft_model.key_to_index['работа']]
print(vector)
# w2v_model.save('fasttext.model')

[-1.28143653e-01  6.31993353e-01  3.97833914e-01 -1.69036105e-01
  1.94467843e-01  2.62616009e-01 -1.29946560e-01  2.99574584e-01
  5.06506674e-02  5.92105603e-03  8.87992755e-02  4.69435602e-01
 -2.00550243e-01  6.64121136e-02  1.52241021e-01 -3.72883081e-01
 -3.08863670e-02 -1.04671396e-01  1.48081467e-01 -8.00064430e-02
  7.97154009e-02 -1.13625549e-01 -3.49553585e-01 -1.42057359e-01
  7.67205238e-01 -1.68227687e-01 -1.12291731e-01  3.17529649e-01
 -3.59128028e-01 -6.41788542e-02 -5.57220131e-02  1.65108591e-01
  1.66394070e-01  5.43315411e-01 -1.59994856e-01  2.17812255e-01
  2.80040950e-01 -5.70807010e-02  1.74972832e-01  1.98939666e-01
  1.93150759e-01  2.96664566e-01  7.72891268e-02  5.42501509e-01
  2.76810322e-02 -1.13098420e-01  1.08400442e-01  6.68987632e-03
 -1.36405602e-01 -9.27114263e-02 -5.01722358e-02 -3.12990457e-01
  7.97539577e-02  3.24938953e-01 -1.21801049e-01  2.70672590e-01
 -4.02754359e-02  3.37236971e-01 -4.76787239e-01 -1.47199020e-01
  8.80351439e-02 -2.75021

In [47]:
# word_vectors = w2v_model[:]
# X = word_averaging_list(ft_model, X_token)

X_train = word_averaging_list_pre_trained(ft_model, X_train_token_pre_trained)
X_test = word_averaging_list_pre_trained(ft_model, X_test_token_pre_trained)

In [48]:
# X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

## Обучение нейронных сетей

In [49]:
f1_score_ft_pre_trained = []
 
for fa in [F.relu, F.leaky_relu]:
    for optimizer in [optim.SGD, optim.AdamW]:
        for initialization in ['kaiming', 'xavier', 'None']:
            for regularization in ['dropout', 'l2_reg', 'None']:
                for normalization in ['layerNorm', 'None']:
                    for schedulerType in ['ExponentialLR', 'MultiStepLR']:
                        for learningRate in [0.01, 0.001, 0.0001]:
                        
                            if normalization == 'layerNorm':
                                normType = True
                            else:
                                normType = False
                    
                            if regularization == 'dropout':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, dropout = True, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                            elif regularization == 'l2_reg':
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate, weight_decay=1e-4)
                            else:
                                net = Net_3_layer(layers=[32, 16, 8, 4], func_activation=fa, weight_init=initialization, normalization = normType)
                                optimSet = optimizer(net.parameters(), lr=learningRate)
                        
                            if schedulerType == 'ExponentialLR':
                                scheduler = optim.lr_scheduler.ExponentialLR(optimSet, gamma=0.8)
                            else:
                                scheduler = optim.lr_scheduler.MultiStepLR(optimSet, gamma=0.8, milestones=[1,3,5])
                        
                            net = train_net(net=net, x=X_train, y=y_train, batch=512, epochs=EPOCHS, device=device, optimizer=optimSet, scheduler=scheduler)
        
                            f1_score_ft_pre_trained.append(test_net(net=net, device=device, x=X_test, y=y_test, batch=1))

------------------------------
tensor(0.7596, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7963, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7642, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8170, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7642, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8078, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7665, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7780, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7918, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7895, device='cuda:0', grad_fn=<NllLossBackward0>)
------------------------------
tensor(0.7825, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7583, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8044, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7583, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.7802, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor(0.8044, device='cuda:0', grad_f

In [50]:
print(datetime.datetime.now().strftime('%d-%B-%Y %H:%M:%S'))

18-March-2023 11:38:12


In [51]:
for cur in [f1_score_w2v_self_trained, f1_score_w2v_pre_trained, f1_score_ft_self_trained, f1_score_ft_pre_trained]:
    print(len(cur))

432
432
432
432


# Сводная таблица

In [52]:
# solid, 3, 6, 12, 36, 108, 216

summary = pd.DataFrame()

summary['Вид признаков'] = np.array(['Word2vec'] * 864 + ['fastText'] * 864, dtype=str)
summary['Модель предобучена?'] = np.array((['Нет'] * 432 + ['Да'] * 432) * 2, dtype=str)

summary['Количество скрытых слоев'] = np.array([3] * 1728, dtype=int)
summary['Количество нейронов'] = np.array(['32-16-8'] * 1728, dtype=str)
summary['Размер батча'] = np.array(['512'] * 1728, dtype=int)

summary['Функция активации'] = np.array((['ReLU'] * 864 + ['LeakyReLU'] * 864), dtype=str)
summary['Оптимизатор'] = np.array((['SGD'] * 108 + ['AdamW'] * 108) * 8, dtype=str)
summary['Инициализация'] = np.array((['Kaiming'] * 36 + ['Xavier'] * 36 + ['None'] * 36) * 16, dtype=str)
summary['Регуляризация'] = np.array((['Dropout'] * 12 + ['L2'] * 12 + ['None'] * 12) * 48, dtype=str)
summary['Нормализация'] = np.array((['LayerNorm'] * 6 + ['None'] * 6) * 144, dtype=str)
summary['Настройки скорости обучения'] = np.array((['ExponentialLR'] * 3 + ['MultiStepLR'] * 3) * 288, dtype=str)
summary['Скорость обучения'] = np.array([0.01, 0.001, 0.0001] * 576, dtype=str)

summary['Weighted F1-score'] = np.array([round(el, 2) for el in f1_score_w2v_self_trained] \
                                        + [round(el, 2) for el in f1_score_w2v_pre_trained] \
                                        + [round(el, 2) for el in f1_score_ft_self_trained] \
                                        + [round(el, 2) for el in f1_score_ft_pre_trained], dtype=float)

summary_sort = summary.sort_values(by='Weighted F1-score', ascending=False)
summary_sort

Unnamed: 0,Вид признаков,Модель предобучена?,Количество скрытых слоев,Количество нейронов,Размер батча,Функция активации,Оптимизатор,Инициализация,Регуляризация,Нормализация,Настройки скорости обучения,Скорость обучения,Weighted F1-score
1287,fastText,Нет,3,32-16-8,512,LeakyReLU,AdamW,,,LayerNorm,MultiStepLR,0.01,0.88
1041,fastText,Нет,3,32-16-8,512,LeakyReLU,AdamW,Xavier,,,MultiStepLR,0.01,0.84
1191,fastText,Нет,3,32-16-8,512,LeakyReLU,AdamW,Kaiming,Dropout,LayerNorm,MultiStepLR,0.01,0.84
1035,fastText,Нет,3,32-16-8,512,LeakyReLU,AdamW,Xavier,,LayerNorm,MultiStepLR,0.01,0.84
994,fastText,Нет,3,32-16-8,512,LeakyReLU,AdamW,Kaiming,L2,,MultiStepLR,0.001,0.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,fastText,Нет,3,32-16-8,512,LeakyReLU,SGD,Kaiming,,,ExponentialLR,0.001,0.29
896,fastText,Нет,3,32-16-8,512,LeakyReLU,SGD,Kaiming,,,ExponentialLR,0.0001,0.29
898,fastText,Нет,3,32-16-8,512,LeakyReLU,SGD,Kaiming,,,MultiStepLR,0.001,0.29
899,fastText,Нет,3,32-16-8,512,LeakyReLU,SGD,Kaiming,,,MultiStepLR,0.0001,0.29


# Выводы

Как можно увидеть по таблице - самого лучшего результата (0.84) достигла непредеробученная модель на признаках fastText с оптимизатором AdamW, функцией активации LeakyReLU, Dropout регуляризацией, "планировщиком" MultiStepLR с начальной скоростью 0.01 с LayerNorm нормализацией.