In [31]:
import numpy as np
from collections import Counter

In [32]:
# Чтение .conllu с получением токенов и pos-тэгов к ним

def extract_sentences_and_pos_from_file(path):
    sentences = []
    pos_tags = []
    with open(path, 'r', encoding='utf-8') as f:
        current_sentence = []
        current_pos = []
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                continue
            if not line:
                if current_sentence:
                    sentences.append(current_sentence)
                    pos_tags.append(current_pos)
                    current_sentence = []
                    current_pos = []
                continue
            parts = line.split('\t')
            if len(parts) >= 4:
                word = parts[1]  # слово
                pos = parts[3]   # часть речи
                current_sentence.append(word)
                current_pos.append(pos)
        if current_sentence:
            sentences.append(current_sentence)
            pos_tags.append(current_pos)
    return sentences, pos_tags
###########################################################################

sentences, pos_tags = extract_sentences_and_pos_from_file('ru_syntagrus-ud-train-b.conllu')

In [None]:
all_words = [word for sent in sentences for word in sent]
all_unique_tags = sorted(set(tag for tag_seq in pos_tags for tag in tag_seq))

word_counts = Counter(all_words)
MIN_FREQ = 2
vocab_words = [word for word, cnt in word_counts.items() if cnt >= MIN_FREQ]

word2idx = {"<PAD>": 0, "<UNK>": 1}
for i, word in enumerate(vocab_words, start=2):
    word2idx[word] = i

tag2idx = {tag: i for i, tag in enumerate(all_unique_tags)}
tag2idx["<PAD>"] = len(tag2idx)

In [34]:
def encode_sentences(sentences, tags, word2idx, tag2idx):
    X, Y = [], []
    unk_id = word2idx["<UNK>"]
    pad_tag_id = tag2idx["<PAD>"]
    
    for sent, tag_seq in zip(sentences, tags):
        x = [word2idx.get(word, unk_id) for word in sent]
        y = [tag2idx[tag] for tag in tag_seq]
        X.append(x)
        Y.append(y)
    return X, Y

X, Y = encode_sentences(sentences, pos_tags, word2idx, tag2idx)

In [35]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 0
for sentence in sentences:
    if max_len < len(sentence):
        max_len = len(sentence)
    else:
        continue

MAX_LEN = max_len

X_padded = pad_sequences(X, maxlen=MAX_LEN, padding='post', value=word2idx["<PAD>"])
y_padded = pad_sequences(Y, maxlen=MAX_LEN, padding='post', value=tag2idx["<PAD>"])

In [36]:
X_final = np.array(X_padded)
y_final = np.array(y_padded).reshape(-1, MAX_LEN, 1)  # для sparse_categorical_crossentropy

In [37]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, TimeDistributed

vocab_size = len(word2idx)
num_tags = len(tag2idx)

inputs = Input(shape=(MAX_LEN,))
embed = Embedding(vocab_size, 100, mask_zero=True)(inputs)
bilstm = Bidirectional(LSTM(128, return_sequences=True))(embed)
outputs = TimeDistributed(Dense(num_tags, activation='softmax'))(bilstm)

model = Model(inputs, outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(X_final, y_final, batch_size=32, epochs=10, validation_split=0.1)

I0000 00:00:1759692528.279584   40542 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4620 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3070 Ti, pci bus id: 0000:01:00.0, compute capability: 8.6


Epoch 1/10


2025-10-05 22:28:53.912401: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:473] Loaded cuDNN version 91301


[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 28ms/step - accuracy: 0.0708 - loss: 0.6603 - val_accuracy: 0.0857 - val_loss: 0.2588
Epoch 2/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 27ms/step - accuracy: 0.0843 - loss: 0.1554 - val_accuracy: 0.0864 - val_loss: 0.2287
Epoch 3/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 28ms/step - accuracy: 0.0855 - loss: 0.1144 - val_accuracy: 0.0871 - val_loss: 0.2195
Epoch 4/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 29ms/step - accuracy: 0.0863 - loss: 0.0881 - val_accuracy: 0.0870 - val_loss: 0.2420
Epoch 5/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 31ms/step - accuracy: 0.0869 - loss: 0.0682 - val_accuracy: 0.0865 - val_loss: 0.2628
Epoch 6/10
[1m684/684[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 28ms/step - accuracy: 0.0874 - loss: 0.0518 - val_accuracy: 0.0863 - val_loss: 0.2941
Epoch 7/10
[1m684/684[0m 

<keras.src.callbacks.history.History at 0x761aa2106620>

In [39]:
# Возьмём ваше предложение
test_sent = ['Премьер-министр', 'РФ', 'Владимир', 'Путин', 'подписал', 'распоряжение', 'о', 'переводе', 'Государственного', 'университета', '-', 'Высшей', 'школы', 'экономики', '(', 'ГУ-ВШЭ', ')', 'из', 'ведения', 'Минэкономразвития', 'в', 'ведение', 'Правительства', 'РФ', '.']

# Преобразуем в ID
x_test = [word2idx.get(w, word2idx["<UNK>"]) for w in test_sent]
x_test = pad_sequences([x_test], maxlen=MAX_LEN, padding='post', value=word2idx["<PAD>"])

# Предсказание
pred = model.predict(x_test)
pred_ids = pred[0].argmax(axis=-1)

# Тогда создайте обратный словарь:
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

# Обратно в теги
pred_tags = [idx2tag[idx] for idx in pred_ids[:len(test_sent)]]
print(list(zip(test_sent, pred_tags)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[('Премьер-министр', 'NOUN'), ('РФ', 'PROPN'), ('Владимир', 'PROPN'), ('Путин', 'PROPN'), ('подписал', 'VERB'), ('распоряжение', 'NOUN'), ('о', 'ADP'), ('переводе', 'NOUN'), ('Государственного', 'ADJ'), ('университета', 'NOUN'), ('-', 'PUNCT'), ('Высшей', 'ADJ'), ('школы', 'NOUN'), ('экономики', 'NOUN'), ('(', 'PUNCT'), ('ГУ-ВШЭ', 'PROPN'), (')', 'PUNCT'), ('из', 'ADP'), ('ведения', 'NOUN'), ('Минэкономразвития', 'PROPN'), ('в', 'ADP'), ('ведение', 'NOUN'), ('Правительства', 'NOUN'), ('РФ', 'PROPN'), ('.', 'PUNCT')]
