Импорт библиотек

In [27]:
import re
import os
import pickle
import numpy as np
import random
import torch
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import DataCollatorForLanguageModeling, BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments, BertModel
from gensim.models import Word2Vec, FastText
from chardet.universaldetector import UniversalDetector
from datasets import Dataset

Загрузка  nltk данных

In [13]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nmens\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nmens\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nmens\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Обработчик кодировки

In [None]:
DATA_DIR = "data"
FILES = [
    ["Biblioteka_prikluchenij.txt", "utf-8"]
    ["detective_for_kidds.txt", "utf-8"]
    ["detective_masters.txt", "windows-1251"]
    ["russian_love_story.txt", "utf-8"]
]

detector = UniversalDetector()

for file in FILES:
    with open(os.path.join(DATA_DIR, file), 'rb') as f:
        for line in f:
            detector.feed(line)
            if detector.done:
                break
        detector.close()
print(detector.result)

Обработчик текста

In [2]:
def has_english(s: str) -> bool:
    return any(c.isalpha() and c.isascii() for c in s)


def preprocess_text(
    text: str,
    language: str = 'russian',
    lemmatize: bool = True,
    remove_stopwords: bool = True,
    min_word_length: int = 2,
    extra_stopwords: list = None
) -> list:
    """
    Предобрабатывает текст для NLP-задач.
    
    Параметры:
        text (str): Исходный текст
        language (str): Язык текста ('russian' или 'english')
        lemmatize (bool): Применять лемматизацию
        remove_stopwords (bool): Удалять стоп-слова
        min_word_length (int): Минимальная длина сохраняемых слов
        extra_stopwords (list): Дополнительные стоп-слова
    
    Возвращает:
        list: Список обработанных токенов
    """
    try:
        # Инициализация инструментов
        morph = MorphAnalyzer() if lemmatize and language == 'russian' else None
        stop_words = set(stopwords.words(language)) if remove_stopwords else set()
        
        # Добавление дополнительных стоп-слов
        if extra_stopwords:
            stop_words.update(extra_stopwords)

        # Очистка текста
        text = text.lower()
        text = re.sub(r'[^a-zа-яё\s]', ' ', text, flags=re.IGNORECASE)  # Удаление пунктуации и цифр
        text = re.sub(r'\s+', ' ', text).strip()  # Удаление лишних пробелов

        # Токенизация
        tokens = word_tokenize(text, language=language)

        # Обработка токенов
        processed_tokens = []
        for token in tokens:
            # Удаление коротких слов
            if len(token) < min_word_length:
                continue
                
            # Лемматизация
            if lemmatize and morph:
                lemma = morph.parse(token)[0].normal_form
                token = lemma

            # Удаление стоп-слов
            if remove_stopwords and token in stop_words:
                continue
            
            if has_english(token):
                continue
                
            processed_tokens.append(token)

        return processed_tokens

    except LookupError as e:
        print(f"Ошибка: {e}. Проверьте установку необходимых ресурсов nltk.")
        return []
    except Exception as e:
        print(f"Неизвестная ошибка: {e}")
        return []

Загрузка текста

In [16]:
FILES = [
    ["Biblioteka_prikluchenij.txt", "utf-8"],
    ["detective_for_kidds.txt", "utf-8"],
    ["detective_masters.txt", "windows-1251"],
    ["russian_love_story.txt", "utf-8"]
]

DATA_DIR = "data"
texts = []

for filename, codec in FILES:
        path = os.path.join(DATA_DIR, filename)
        with open(path, 'r', encoding=codec) as f:
            texts.append(f.read())

Обработка текста

In [17]:
sentences = [preprocess_text(text) for text in texts]

Обучение моделей

In [None]:
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=300,
    window=5,
    min_count=2,
    workers=4
)

fasttext_model = FastText(
    sentences=sentences,
    vector_size=300,
    window=5,
    min_count=2,
    workers=4
)

In [10]:
with open("data/preprocessed_data.pkl", "rb") as f:
    sentences = pickle.load(f)

In [17]:
len([' '.join(i) for i in sentences])

4

In [18]:
tfidf = TfidfVectorizer(
    min_df=3,           
    max_features=5000   
)

processed_texts = [' '.join(sentence) for sentence in sentences]
tfidf_matrix = tfidf.fit_transform(processed_texts)

tfidf_vec = {}
for word in tfidf.get_feature_names_out():
    idx = tfidf.vocabulary_[word]
    vector = tfidf_matrix[:, idx].toarray().flatten()
    tfidf_vec[word] = vector

Сохранение моделей

In [None]:
word2vec_model.save(os.path.join('models', 'word2vec.model'))
fasttext_model.save(os.path.join('models', 'fasttext.model'))

In [19]:
with open('tfidf_vec.pkl', 'wb') as f:
    pickle.dump(tfidf_vec, f)

Сохранение обработанных данных

In [20]:
with open("data/preprocessed_data.pkl", "wb") as f:
    pickle.dump(sentences, f)

Трансформеры (ruBERT и fine tuned). Загрузка берта

In [21]:
stock_model_name = 'DeepPavlov/rubert-base-cased'

os.makedirs('./stock_bert', exist_ok=True)
os.makedirs('./finetuned_bert', exist_ok=True)

stock_tokenizer = BertTokenizer.from_pretrained(stock_model_name)
stock_model = BertForMaskedLM.from_pretrained(stock_model_name)

stock_tokenizer.save_pretrained('./stock_bert')
stock_model.save_pretrained('./stock_bert', safe_serialization=True)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Файн тюнинг. Подготовка данных.

In [22]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [23]:
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        padding='max_length',
        truncation=True,
        max_length=64,
        return_special_tokens_mask=True
    )
    return tokenized


dataset = Dataset.from_dict({"text": [" ".join(text) for text in sentences]})
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Сохранение токенизированных данных для Берт

In [24]:
with open("data/tokenized_dataset.pkl", "wb") as f:
    pickle.dump(tokenized_dataset, f)

Файн тюнинг. Обучение.

In [25]:
model = BertForMaskedLM.from_pretrained('DeepPavlov/rubert-base-cased')

with open("data/tokenized_dataset.pkl", "rb") as f:
    tokens = pickle.load(f)


training_args = TrainingArguments(
    output_dir='./bert-finetuned',
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    learning_rate=5e-5,
    weight_decay=0.01
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokens,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=20, training_loss=4.736667633056641, metrics={'train_runtime': 49.1246, 'train_samples_per_second': 0.814, 'train_steps_per_second': 0.407, 'total_flos': 1317391488000.0, 'train_loss': 4.736667633056641, 'epoch': 10.0})

Сохранение Берта

In [26]:
model.save_pretrained('./finetuned_bert', safe_serialization=True)
tokenizer.save_pretrained('./finetuned_bert')

('./finetuned_bert\\tokenizer_config.json',
 './finetuned_bert\\special_tokens_map.json',
 './finetuned_bert\\vocab.txt',
 './finetuned_bert\\added_tokens.json')

Создание вложений

Подготовка и загрузка данных

In [None]:
dm_path = "data/adjectives100_ruscorp_add_ppmi_300.dm"
phrase_vectors = {}
data = {}
vectors = {'initial' : list(),
            'wor2vec' : list(),
            'stock_bert' : list(),
            'finetuned_bert' : list(),
            'tf_idf' : list(),
            'fasttext': list()}

with open(dm_path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        parts = line.strip().split()
        phrase = parts[0]
        adjective, noun = phrase.split('_', 1)
        word = preprocess_text(noun)
        if not word:
            continue
        word = word[0]
        
        vector = list(map(float, parts[1:]))
        
        if not data.get(adjective, None):
            data[adjective] = dict()
        
        data[adjective][word] = vectors
        data[adjective][word]['initial'] = np.array(vector)

20419it [1:21:55,  4.15it/s] 


Создание вложений по моделям

Word2Vec

In [21]:
model = Word2Vec.load("models/word2vec.model")

for adj in tqdm(data.keys()):
    for word in data[adj].keys():
        try:
            vector = model.wv[word]
            vector = vector / np.linalg.norm(vector)
            data[adj][word]['wor2vec'] = vector
        except:
            print(f"Слово {word} не найдено")

100%|██████████| 99/99 [00:00<00:00, 883.73it/s]

Слово коф не найдено
Слово беспризорность не найдено
Слово млрд не найдено
Слово кейвинга не найдено
Слово спелеотуризм не найдено
Слово ество не найдено
Слово землеведение не найдено
Слово боев не найдено
Слово чуйка не найдено
Слово юечж не найдено
Слово токмак не найдено
Слово лиличка не найдено
Слово окный не найдено
Слово высокопочитание не найдено
Слово хабуг не найдено
Слово жиган не найдено
Слово рамень не найдено
Слово коф не найдено
Слово отесинька не найдено
Слово упк не найдено
Слово маргарит не найдено
Слово фазис не найдено
Слово уренгой не найдено
Слово парадигма не найдено
Слово коап не найдено
Слово леф не найдено
Слово репатриант не найдено
Слово миров не найдено
Слово стат не найдено
Слово лукий не найдено
Слово бох не найдено
Слово монархиня не найдено
Слово честие не найдено
Слово новагород не найдено
Слово радостие не найдено
Слово старчество не найдено
Слово песнотворец не найдено
Слово алфим не найдено
Слово вертереть не найдено
Слово моща не найдено
Слово экзер




Fasttext

In [22]:
fasttext_model = FastText.load("models/fasttext.model")

for adj in tqdm(data.keys()):
    for word in data[adj].keys():
        vector = fasttext_model.wv[word]
        vector = vector / np.linalg.norm(vector)
        data[adj][word]['fasttext'] = vector

100%|██████████| 99/99 [00:00<00:00, 749.83it/s]


TF-IDF

In [None]:
for adj in tqdm(data.keys()):
    for word in data[adj].keys():
        try:
            vector = tfidf_vec[word]
            vector = vector / np.linalg.norm(vector)
            data[adj][word]['tf_idf'] = vector
        except:
            print(f"Слово {word} не найдено")

ruBERT

In [30]:
def get_bert_embedding(word, tokenizer, model):
    inputs = tokenizer(
        word, 
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=32,
        add_special_tokens=False
    )
    
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = torch.mean(outputs.last_hidden_state, dim=1)
    return embeddings.numpy()[0]

In [28]:
stock_tokenizer = BertTokenizer.from_pretrained('stock_bert')
stock_model = BertModel.from_pretrained('stock_bert')

finetuned_tokenizer = BertTokenizer.from_pretrained('finetuned_bert')
finetuned_model = BertModel.from_pretrained('finetuned_bert')

Some weights of BertModel were not initialized from the model checkpoint at stock_bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at finetuned_bert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
for adj in tqdm(data.keys()):
    for noun in data[adj].keys():
        try:
            stock_vec = get_bert_embedding(noun, stock_tokenizer, stock_model)
            stock_vec = stock_vec / np.linalg.norm(stock_vec)
            data[adj][noun]['stock_bert'] = stock_vec
            
            finetuned_vec = get_bert_embedding(noun, finetuned_tokenizer, finetuned_model)
            finetuned_vec = finetuned_vec / np.linalg.norm(finetuned_vec)
            data[adj][noun]['finetuned_bert'] = finetuned_vec
            
        except Exception as e:
            print(f"Ошибка для {noun}: {str(e)}")

100%|██████████| 99/99 [21:02<00:00, 12.75s/it]


Сохранение полученного словаря

In [34]:
with open("data/embeddings_dict.pkl", "wb") as f:
    pickle.dump(data, f)