In [8]:
from google.colab import files
import sqlite3
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Bidirectional, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import sentencepiece as spm

In [2]:
files.upload()  # Загрузите kaggle.json
# Установите Kaggle CLI, если еще не установлено
!pip install kaggle

# Создайте папку .kaggle и скопируйте туда kaggle.json
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Загрузите набор данных Wikibooks
!kaggle datasets download -d dhruvildave/wikibooks-dataset


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/dhruvildave/wikibooks-dataset
License(s): CC-BY-SA-4.0
Downloading wikibooks-dataset.zip to /content
100% 1.82G/1.82G [00:19<00:00, 127MB/s]
100% 1.82G/1.82G [00:19<00:00, 101MB/s]


In [3]:
!unzip wikibooks-dataset.zip -d wikibooks

Archive:  wikibooks-dataset.zip
  inflating: wikibooks/wikibooks.sqlite  


In [3]:
conn = sqlite3.connect('/content/wikibooks/wikibooks.sqlite')
cursor = conn.cursor()

cursor.execute("SELECT body_text FROM ru Limit 100")
rows = cursor.fetchall()

# Вывод результатов (например, первые 10 строк)
for row in rows[:10]:
  print(row)

conn.close()


('Рабочая станция;\nСервер;\nПерсональный компьютер.',)
('В Википедии имеется статья по теме «Свидетельство частного пилота»\n\n\nГражданское пилотское свидетельство - разрешение на управление определенным видом воздушного судна. Внутри этого свидетельства может быть много разных отметок (а может и не быть), которые или урезают или увеличивают количество разрешенных функций. Самое понятное и известное массам это свидетельство частного пилота. Его выдают после обучения на самолете или на вертолете. Для краткости в разговорной речи это свидетельство могут обозначать через название его иностранного аналога - ППЛ или ПиПиЭль (PPL).  Раньше это свидетельство в России называлось свидетельство пилота-любителя.\nПорядок выдачи свидетельств описан в ФАП-147. Содержимое этого документа почти целиком повторяет Приложение 1 "Выдача свидетельств авиационному персоналу" к Чикагской конвенции. За исключением того что свидетельства СВС в нем нет и это российская особенность.\nПилотские в каждой стране

In [5]:
texts = [row[0] for row in rows]
seed_text = "Что делать если"

SRNN с посимвольной токенизацией

In [15]:
text = ' '.join(texts)
# Создаем токенизатор для символов
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(text)
vocab_size = len(tokenizer.word_index) + 1

# Преобразуем текст в последовательности символов
sequences = tokenizer.texts_to_sequences([text])[0]
max_sequence_length = 40  # Длина входной последовательности для RNN
step = 3  # Шаг между последовательностями

X = []
y = []

for i in range(0, len(sequences) - max_sequence_length, step):
    X.append(sequences[i:i + max_sequence_length])
    y.append(sequences[i + max_sequence_length])

X = np.array(X)
y = to_categorical(y, num_classes=vocab_size)

In [16]:
# Функция для генерации текста на основе символов
def generate_text_char(model, tokenizer, seed_text, max_sequence_length, num_chars):
    reverse_word_index = {index: char for char, index in tokenizer.word_index.items()}

    def index_to_char(index):
        return reverse_word_index.get(index, '')

    result = seed_text
    for _ in range(num_chars):
        # Преобразуем seed_text в последовательность индексов
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_sequence_length, padding='post')

        # Предсказываем следующий символ
        predicted_probs = model.predict(encoded, verbose=0)
        predicted_char_index = np.argmax(predicted_probs)

        # Преобразуем индекс в символ
        predicted_char = index_to_char(predicted_char_index)

        # Добавляем символ к текущему seed тексту
        seed_text = seed_text[1:] + predicted_char
        result += predicted_char

    return result

In [17]:
# Создаем модель
model_char = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    SimpleRNN(128),
    Dense(vocab_size, activation='softmax')
])

model_char.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_char.fit(X, y, batch_size=64, epochs=8, validation_split=0.2)





Epoch 1/8
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 9ms/step - accuracy: 0.1818 - loss: 3.5280 - val_accuracy: 0.2452 - val_loss: 2.8883
Epoch 2/8
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.3055 - loss: 2.5824 - val_accuracy: 0.2818 - val_loss: 2.7511
Epoch 3/8
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.3543 - loss: 2.3715 - val_accuracy: 0.2999 - val_loss: 2.6931
Epoch 4/8
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.3907 - loss: 2.2302 - val_accuracy: 0.3211 - val_loss: 2.6575
Epoch 5/8
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.4163 - loss: 2.1281 - val_accuracy: 0.3277 - val_loss: 2.6436
Epoch 6/8
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.4397 - loss: 2.0455 - val_accuracy: 0.3295 - val_loss: 2.6595
Epoch 7/8
[1m20

<keras.src.callbacks.history.History at 0x7b3ff99b8a60>

In [18]:
# Используем функцию для генерации текста

generated_text = generate_text_char(model_char, tokenizer, seed_text, max_sequence_length, num_chars=100)
print(generated_text)

Что делать если     .  --   .  .      --   .      --   .      --   .      --   .      --   .      --   .      --   


Однонаправленная однослойная LSTM с посимвольной токенизацией

In [19]:
# Создаем модель однослойной LSTM
model_char = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    LSTM(128),  # Однонаправленная однослойная LSTM
    Dense(vocab_size, activation='softmax')
])

model_char.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_char.fit(X, y, batch_size=64, epochs=10, validation_split=0.2)


generated_text_char = generate_text_char(model_char, tokenizer, seed_text, max_sequence_length, num_chars=100)
print("Generated Text (Char):", generated_text_char)

Epoch 1/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.1769 - loss: 3.4604 - val_accuracy: 0.2250 - val_loss: 2.9693
Epoch 2/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.2647 - loss: 2.7135 - val_accuracy: 0.2518 - val_loss: 2.8470
Epoch 3/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.3025 - loss: 2.5430 - val_accuracy: 0.2728 - val_loss: 2.7681
Epoch 4/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.3268 - loss: 2.4249 - val_accuracy: 0.2871 - val_loss: 2.7150
Epoch 5/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 7ms/step - accuracy: 0.3522 - loss: 2.3240 - val_accuracy: 0.2984 - val_loss: 2.6861
Epoch 6/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 7ms/step - accuracy: 0.3757 - loss: 2.2429 - val_accuracy: 0.3049 - val_loss: 2.6644
Epoch 7/10

Однонаправленная многослойная LSTM с посимвольной токенизацией

In [20]:
# Создаем модель многослойной LSTM
model_char_multi = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])

model_char_multi.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_char_multi.fit(X, y, batch_size=64, epochs=10, validation_split=0.2)

# Используем ту же функцию для генерации текста
generated_text_char_multi = generate_text_char(model_char_multi, tokenizer, seed_text, max_sequence_length, num_chars=100)
print("Generated Text (Multi-layer Char):", generated_text_char_multi)

Epoch 1/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 10ms/step - accuracy: 0.1566 - loss: 3.6712 - val_accuracy: 0.1910 - val_loss: 3.0871
Epoch 2/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.2352 - loss: 2.8179 - val_accuracy: 0.2381 - val_loss: 2.9022
Epoch 3/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.2776 - loss: 2.6381 - val_accuracy: 0.2601 - val_loss: 2.8180
Epoch 4/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.3044 - loss: 2.5263 - val_accuracy: 0.2778 - val_loss: 2.7571
Epoch 5/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 10ms/step - accuracy: 0.3299 - loss: 2.4250 - val_accuracy: 0.2896 - val_loss: 2.7287
Epoch 6/10
[1m2033/2033[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.3515 - loss: 2.3404 - val_accuracy: 0.2978 - val_loss: 2.7012
Epoch 

Двунаправленная LSTM с посимвольной токенизацией

In [None]:
# Создаем модель двунаправленной LSTM
model_char_bidir = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    Bidirectional(LSTM(128)),
    Dense(vocab_size, activation='softmax')
])

model_char_bidir.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_char_bidir.fit(X, y, batch_size=64, epochs=10, validation_split=0.2)


generated_text_char = generate_text_char(model_char_bidir, tokenizer, seed_text, max_sequence_length, num_chars=100)
print("Generated Text (Bidirectional Char):", generated_text_char)

Epoch 1/2
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 188ms/step - accuracy: 0.2017 - loss: 3.5202 - val_accuracy: 0.1797 - val_loss: 3.2093
Epoch 2/2
[1m381/381[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 177ms/step - accuracy: 0.2984 - loss: 2.6790 - val_accuracy: 0.2633 - val_loss: 2.8756
Generated Text (Bidirectional Char): Что делать если   ,,,,)))))))))))))),),))))))))))))),),))))))))))))),),))))))))))))),),))))))))))))),),))))))))))))


SRNN с пословной токенизацией

In [6]:
# Создаем токенизатор для слов
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
vocab_size = len(tokenizer.word_index) + 1

# Определяем максимальную длину последовательности
max_sequence_length = 10

# Преобразуем тексты в последовательности индексов
sequences = tokenizer.texts_to_sequences(texts)
X = []
y = []

for sequence in sequences:
    for i in range(1, len(sequence)):
        n_gram_sequence = sequence[:i + 1]
        X.append(n_gram_sequence[:-1])
        y.append(n_gram_sequence[-1])

# Применяем `pad_sequences` для унификации длины входных данных
X = pad_sequences(X, maxlen=max_sequence_length, padding='pre')
y = to_categorical(y, num_classes=vocab_size)

# Создаем модель
model_word = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    SimpleRNN(128),
    Dense(vocab_size, activation='softmax')
])

model_word.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_word.fit(X, y, batch_size=64, epochs=10, validation_split=0.2)

# Функция для генерации текста на основе слов
def generate_text_word(model, tokenizer, seed_text, max_sequence_length, num_words):
    reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

    def index_to_word(index):
        return reverse_word_index.get(index, '')

    result = seed_text
    for _ in range(num_words):
        # Преобразуем seed_text в последовательность индексов
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = pad_sequences([encoded], maxlen=max_sequence_length, padding='pre')

        # Предсказываем следующее слово
        predicted_probs = model.predict(encoded, verbose=0)
        predicted_word_index = np.argmax(predicted_probs)

        # Преобразуем индекс в слово
        predicted_word = index_to_word(predicted_word_index)

        # Добавляем слово к текущему seed тексту
        seed_text += ' ' + predicted_word
        result += ' ' + predicted_word

    return result

# Используем функцию для генерации текста

generated_text = generate_text_word(model_word, tokenizer, seed_text, max_sequence_length, num_words=50)
print(generated_text)



Epoch 1/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - accuracy: 0.0198 - loss: 8.4595 - val_accuracy: 0.0269 - val_loss: 9.2699
Epoch 2/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.0357 - loss: 7.2385 - val_accuracy: 0.0285 - val_loss: 9.7567
Epoch 3/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.0766 - loss: 6.3640 - val_accuracy: 0.0332 - val_loss: 10.4670
Epoch 4/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.1275 - loss: 5.5142 - val_accuracy: 0.0275 - val_loss: 10.7543
Epoch 5/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.1926 - loss: 4.7577 - val_accuracy: 0.0252 - val_loss: 10.9113
Epoch 6/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.2677 - loss: 4.1101 - val_accuracy: 0.0275 - val_loss: 11.3986
Epoch 7/10
[1m824/8

Однонаправленная однослойная LSTM с пословной токенизацией

In [9]:
# Создаем модель однослойной LSTM
model_word = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])

model_word.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_word.fit(X, y, batch_size=64, epochs=10, validation_split=0.2)




generated_text_word = generate_text_word(model_word, tokenizer, seed_text, max_sequence_length, num_words=50)
print("Generated Text (Word):", generated_text_word)



Epoch 1/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 25ms/step - accuracy: 0.0171 - loss: 8.5366 - val_accuracy: 0.0262 - val_loss: 9.2075
Epoch 2/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 22ms/step - accuracy: 0.0280 - loss: 7.4708 - val_accuracy: 0.0259 - val_loss: 9.6724
Epoch 3/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 22ms/step - accuracy: 0.0442 - loss: 6.8876 - val_accuracy: 0.0291 - val_loss: 10.2237
Epoch 4/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 22ms/step - accuracy: 0.0693 - loss: 6.2713 - val_accuracy: 0.0340 - val_loss: 10.6018
Epoch 5/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 21ms/step - accuracy: 0.0993 - loss: 5.6668 - val_accuracy: 0.0320 - val_loss: 10.9698
Epoch 6/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 21ms/step - accuracy: 0.1362 - loss: 5.1131 - val_accuracy: 0.0300 - val_loss: 11.4197
Epoch 7/10


Однонаправленная многослойная LSTM с пословной токенизацией

In [10]:
# Создаем модель многослойной LSTM
model_word_multi = Sequential([
    Embedding(vocab_size, 50, input_length=max_sequence_length),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])

model_word_multi.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучаем модель
model_word_multi.fit(X, y, batch_size=64, epochs=10, validation_split=0.2)


generated_text_word_multi = generate_text_word(model_word_multi, tokenizer, seed_text, max_sequence_length, num_words=50)
print("Generated Text (Multi-layer Word):", generated_text_word_multi)

Epoch 1/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 26ms/step - accuracy: 0.0196 - loss: 8.5639 - val_accuracy: 0.0253 - val_loss: 9.2835
Epoch 2/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - accuracy: 0.0296 - loss: 7.5495 - val_accuracy: 0.0240 - val_loss: 9.7553
Epoch 3/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.0347 - loss: 7.2717 - val_accuracy: 0.0228 - val_loss: 10.0264
Epoch 4/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 23ms/step - accuracy: 0.0378 - loss: 6.9823 - val_accuracy: 0.0253 - val_loss: 10.3036
Epoch 5/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 26ms/step - accuracy: 0.0502 - loss: 6.6657 - val_accuracy: 0.0238 - val_loss: 10.4617
Epoch 6/10
[1m824/824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 23ms/step - accuracy: 0.0601 - loss: 6.3975 - val_accuracy: 0.0199 - val_loss: 10.7017
Epoch 7/10


Однонаправленная однослойная LSTM на основе BPE (Byte-Pair Encoding)

In [13]:
# Импорт необходимых библиотек
import sentencepiece as spm
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense
import numpy as np


text = ' '.join(texts)

# Запись текста в файл для обучения модели SentencePiece
with open('bpe_input.txt', 'w') as f:
    f.write(text)

# Обучение модели SentencePiece
spm.SentencePieceTrainer.train(input='bpe_input.txt', model_prefix='bpe', vocab_size=238, model_type='bpe')

# Загрузка обученной модели SentencePiece
sp = spm.SentencePieceProcessor()
sp.load('bpe.model')

# Преобразование текста в последовательности индексов
def text_to_bpe_sequences(text, max_sequence_length, step=3):
    sequences = sp.encode_as_ids(text)
    X = []
    y = []
    if len(sequences) <= max_sequence_length:
        max_sequence_length = len(sequences) - 1
        step = 1
    for i in range(0, len(sequences) - max_sequence_length, step):
        X.append(sequences[i:i + max_sequence_length])
        y.append(sequences[i + max_sequence_length])
    return np.array(X), to_categorical(y, num_classes=sp.get_piece_size())

# Преобразование текста в последовательности BPE индексов
max_sequence_length_bpe = 30
X_bpe, y_bpe = text_to_bpe_sequences(text, max_sequence_length_bpe)

# Создание модели однослойной LSTM на основе BPE
model_bpe = Sequential([
    Embedding(sp.get_piece_size(), 50, input_length=max_sequence_length_bpe),
    LSTM(128),
    Dense(sp.get_piece_size(), activation='softmax')
])

model_bpe.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучение модели
model_bpe.fit(X_bpe, y_bpe, batch_size=64, epochs=10, validation_split=0.2)

def generate_text_bpe(model, sp, seed_text, max_sequence_length, num_pieces):
    result = seed_text
    for _ in range(num_pieces):
        encoded = sp.encode_as_ids(seed_text)
        encoded = pad_sequences([encoded], maxlen=max_sequence_length, padding='pre')

        predicted_probs = model.predict(encoded, verbose=0)
        predicted_piece_index = np.argmax(predicted_probs)

        predicted_piece_index = int(predicted_piece_index)

        predicted_piece = sp.id_to_piece(predicted_piece_index)

        result += predicted_piece
        seed_text = result  # Обновляем текст для следующей итерации

    return result

generated_text_bpe = generate_text_bpe(model_bpe, sp, seed_text, max_sequence_length_bpe, num_pieces=50)
print("Generated Text (BPE):", generated_text_bpe)

Epoch 1/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - accuracy: 0.1649 - loss: 3.4981 - val_accuracy: 0.2283 - val_loss: 2.9856
Epoch 2/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - accuracy: 0.2511 - loss: 2.8095 - val_accuracy: 0.2647 - val_loss: 2.8527
Epoch 3/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 6ms/step - accuracy: 0.2911 - loss: 2.6312 - val_accuracy: 0.2783 - val_loss: 2.7753
Epoch 4/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.3259 - loss: 2.4823 - val_accuracy: 0.3051 - val_loss: 2.7212
Epoch 5/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 6ms/step - accuracy: 0.3571 - loss: 2.3681 - val_accuracy: 0.3158 - val_loss: 2.6865
Epoch 6/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 6ms/step - accuracy: 0.3809 - loss: 2.2738 - val_accuracy: 0.3203 - val_loss: 2.6535
Epoch 7/10

Многослойная LSTM на основе BPE

In [14]:
# Создание многослойной модели LSTM на основе BPE
model_bpe_multi = Sequential([
    Embedding(sp.get_piece_size(), 50, input_length=max_sequence_length_bpe),
    LSTM(128, return_sequences=True),
    LSTM(128),
    Dense(sp.get_piece_size(), activation='softmax')
])

model_bpe_multi.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Обучение модели
model_bpe_multi.fit(X_bpe, y_bpe, batch_size=64, epochs=10, validation_split=0.2)

# Используем ту же функцию для генерации текста на основе BPE
generated_text_bpe_multi = generate_text_bpe(model_bpe_multi, sp, seed_text, max_sequence_length_bpe, num_pieces=50)
print("Generated Text (Multi-layer BPE):", generated_text_bpe_multi)

Epoch 1/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 9ms/step - accuracy: 0.1536 - loss: 3.5940 - val_accuracy: 0.2136 - val_loss: 3.0470
Epoch 2/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.2288 - loss: 2.9160 - val_accuracy: 0.2387 - val_loss: 2.9419
Epoch 3/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 8ms/step - accuracy: 0.2601 - loss: 2.7568 - val_accuracy: 0.2550 - val_loss: 2.8597
Epoch 4/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.2870 - loss: 2.6354 - val_accuracy: 0.2737 - val_loss: 2.8031
Epoch 5/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 8ms/step - accuracy: 0.3123 - loss: 2.5179 - val_accuracy: 0.2899 - val_loss: 2.7638
Epoch 6/10
[1m1940/1940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 8ms/step - accuracy: 0.3387 - loss: 2.4241 - val_accuracy: 0.2998 - val_loss: 2.7285
Epoch 7/10