In [4]:
import numpy as np
import opencorpora
from collections import Counter
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed

# Блок обучения НС

In [5]:
# Чтение .conllu с получением токенов и pos-тэгов к ним

def extract_sentences_and_pos_from_file(path):
    sentences = []
    pos_tags = []
    with open(path, 'r', encoding='utf-8') as f:
        current_sentence = []
        current_pos = []
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                if current_sentence:
                    sentences.append(current_sentence)
                    pos_tags.append(current_pos)
                    current_sentence = []
                    current_pos = []
                continue
            parts = line.split('\t')
            if len(parts) >= 4:
                word = parts[1]  # слово
                pos = parts[3]   # часть речи
                current_sentence.append(word)
                current_pos.append(pos)
        if current_sentence:
            sentences.append(current_sentence)
            pos_tags.append(current_pos)
    return sentences, pos_tags
###########################################################################

sentences, pos_tags = extract_sentences_and_pos_from_file('ru_syntagrus-ud-train-b.conllu')

In [6]:
all_words = [word for sent in sentences for word in sent]
all_unique_tags = sorted(set(tag for tag_seq in pos_tags for tag in tag_seq))

word_counts = Counter(all_words)
MIN_FREQ = 2
vocab_words = [word for word, cnt in word_counts.items() if cnt >= MIN_FREQ]

word2idx = {"<PAD>": 0, "<UNK>": 1}
for i, word in enumerate(vocab_words, start=2):
    word2idx[word] = i

tag2idx = {tag: i for i, tag in enumerate(all_unique_tags)}
tag2idx["<PAD>"] = len(tag2idx)

In [7]:
def encode_sentences(sentences, tags, word2idx, tag2idx):
    X, Y = [], []
    unk_id = word2idx["<UNK>"]
    
    for sent, tag_seq in zip(sentences, tags):
        x = [word2idx.get(word, unk_id) for word in sent]
        y = [tag2idx[tag] for tag in tag_seq]
        X.append(x)
        Y.append(y)
    return X, Y

X, Y = encode_sentences(sentences, pos_tags, word2idx, tag2idx)

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 0
for sentence in sentences:
    if max_len < len(sentence):
        max_len = len(sentence)
    else:
        continue

MAX_LEN = max_len

X_padded = pad_sequences(X, maxlen=MAX_LEN, padding='post', value=word2idx["<PAD>"])
y_padded = pad_sequences(Y, maxlen=MAX_LEN, padding='post', value=tag2idx["<PAD>"])

In [9]:
X_final = np.array(X_padded)
y_final = np.array(y_padded).reshape(-1, MAX_LEN, 1)  # для sparse_categorical_crossentropy

In [10]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed

vocab_size = len(word2idx)
num_tags = len(tag2idx)

inputs = Input(shape=(MAX_LEN,))
embed = Embedding(vocab_size, 100, mask_zero=True)(inputs)
bilstm = Bidirectional(LSTM(128, return_sequences=True))(embed)
outputs = TimeDistributed(Dense(num_tags, activation='softmax'))(bilstm)

model = Model(inputs, outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(X_final, y_final, batch_size=32, epochs=10, validation_split=0.1)

I0000 00:00:1759694108.785012   50039 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4778 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/10


2025-10-05 22:55:14.559219: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91301


[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 29ms/step - accuracy: 0.0705 - loss: 0.6534 - val_accuracy: 0.0854 - val_loss: 0.2634
Epoch 2/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 27ms/step - accuracy: 0.0843 - loss: 0.1535 - val_accuracy: 0.0873 - val_loss: 0.2122
Epoch 3/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 26ms/step - accuracy: 0.0855 - loss: 0.1120 - val_accuracy: 0.0868 - val_loss: 0.2323
Epoch 4/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 28ms/step - accuracy: 0.0863 - loss: 0.0867 - val_accuracy: 0.0870 - val_loss: 0.2323
Epoch 5/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 28ms/step - accuracy: 0.0869 - loss: 0.0667 - val_accuracy: 0.0870 - val_loss: 0.2540
Epoch 6/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step - accuracy: 0.0874 - loss: 0.0509 - val_accuracy: 0.0865 - val_loss: 0.2902
Epoch 7/10
[1m684/684[0m 

<keras.src.callbacks.history.History at 0x7921f1ffcee0>

In [18]:
# Возьмём ваше предложение
test_sent = ['Он', 'любит', 'печь', 'блины', 'и', 'печь', 'блины']

# Преобразуем в ID
x_test = [word2idx.get(w, word2idx["<UNK>"]) for w in test_sent]
x_test = pad_sequences([x_test], maxlen=MAX_LEN, padding='post', value=word2idx["<PAD>"])

# Предсказание
pred = model.predict(x_test)
pred_ids = pred[0].argmax(axis=-1)

# Тогда создайте обратный словарь:
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Обратно в теги
pred_tags = [idx2tag[idx] for idx in pred_ids[:len(test_sent)]]
print(list(zip(test_sent, pred_tags)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[('Он', 'PRON'), ('любит', 'VERB'), ('печь', 'NOUN'), ('блины', 'NOUN'), ('и', 'CCONJ'), ('печь', 'NOUN'), ('блины', 'NOUN')]


# Блок лемматизации + POS-тэггинга

In [12]:
corpus = opencorpora.load('annot.opcorpora.xml')

OSError: Error reading file 'annot.opcorpora.xml': failed to load "annot.opcorpora.xml": No such file or directory

In [None]:
symbols_to_remove = [',', '.', '?', '!', '\n']

with open('text.txt', 'r') as file:
    text = file.read()

In [None]:
for char in symbols_to_remove:
    if char == '\n':
        text = text.replace(f'{char}', ' ')
    else:
        text = text.replace(f'{char}', '')

splitted_text = text.split()

In [None]:
list_of_tokens = [token.source for token in corpus.tokens]

In [6]:
%%time

counter = 0
word_lemma_gramema = []

for word in splitted_text:
    for p in range(len(list_of_tokens)):
        if word == list_of_tokens[p]:
            finded_token = corpus.tokens[p]
            word_lemma_gramema.append(f'{word}({finded_token.lemma}={finded_token.grammemes[0]})')
            break
        else:
            counter += 1
            if counter == len(list_of_tokens):
                word_lemma_gramema.append(f'{word}(не_нашел_лемму=не_нашел_грамемму)')   
    counter = 0

joined = ' '.join(word_lemma_gramema)

CPU times: user 25.7 s, sys: 1.21 s, total: 26.9 s
Wall time: 26.9 s


In [7]:
joined

'Стала(стал=VERB) стабильнее(не_нашел_лемму=не_нашел_грамемму) экономическая(экономический=ADJF) и(и=CONJ) политическая(политический=ADJF) обстановка(обстановка=NOUN) предприятия(предприятие=NOUN) вывели(вывел=VERB) из(из=PREP) тени(тень=NOUN) зарплаты(зарплата=NOUN) сотрудников(сотрудник=NOUN) Все(весь=ADJF) Гришины(гришин=ADJF) одноклассники(одноклассник=NOUN) уже(уже=ADVB) побывали(побывал=VERB) за(за=PREP) границей(граница=NOUN) он(он=NPRO) был(есть=VERB) чуть(чуть=ADVB) ли(ли=PRCL) не(не=PRCL) единственным(единственный=ADJF) кого(кто=NPRO) не(не=PRCL) вывозили(вывожу=VERB) никуда(никуда=ADVB) дальше(дальше=COMP) Красной(красный=ADJF) Пахры(не_нашел_лемму=не_нашел_грамемму)'