In [1]:
import re

import numpy as np
import keras
from keras.layers import Dense, Dropout, LSTM
from collections import defaultdict

In [2]:
# Load the raw corpus. The context manager closes the file handle as soon as
# the contents have been read instead of leaving that to garbage collection.
with open('./data/raw/text.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# Normalise: lower-case everything, keep only Latin/Cyrillic letters,
# whitespace and full stops, then collapse whitespace runs to a single space.
text = text.lower()
# 'ё' (U+0451) lies outside the а-я range (U+0430-U+044F), so it has to be
# listed explicitly; otherwise it is silently stripped out of words.
text = re.sub(r'[^a-zа-яё\s.]+', '', text)
text = re.sub(r'\s+', ' ', text)

In [3]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))

# Forward (char -> index) and reverse (index -> char) lookup tables.
char2int = {symbol: position for position, symbol in enumerate(chars)}
int2char = dict(enumerate(chars))

In [4]:
# Slide a fixed-size window over every sentence: each window of
# `window_length` characters (dataX) is paired with the character that
# immediately follows it (dataY).
window_length = 25
dataX, dataY = [], []
for chunk in text.split('.'):
    # split('.') drops the delimiter, so put the full stop back.
    chunk = chunk + '.'
    # Chunks not longer than the window give an empty range and add nothing.
    n_windows = len(chunk) - window_length
    dataX.extend(chunk[k:k + window_length] for k in range(n_windows))
    dataY.extend(chunk[k + window_length] for k in range(n_windows))

In [5]:
# One-hot encode the windows into X (sample, position, vocabulary index)
# and the following characters into y (sample, vocabulary index).
n_sentences = len(dataX)
vocab_size = len(chars)
X = np.zeros((n_sentences, window_length, vocab_size), dtype=np.bool_)
y = np.zeros((n_sentences, vocab_size), dtype=np.bool_)
for row, (window, target) in enumerate(zip(dataX, dataY)):
    # Set one cell per position in a single fancy-indexing assignment.
    columns = [char2int[ch] for ch in window]
    X[row, np.arange(len(columns)), columns] = True
    y[row, char2int[target]] = True

In [6]:
# Character-level language model: two stacked LSTM layers with dropout in
# between, followed by a softmax over the vocabulary.
model = keras.Sequential()
# return_sequences=True makes this layer emit its output at every timestep,
# which the second LSTM consumes as its input sequence.
model.add(LSTM(256, input_shape=X.shape[1:], return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss="categorical_crossentropy")
model.fit(X, y, epochs=40, batch_size=128)

2022-05-18 13:59:32.043154: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7ff788ba02b0>

In [10]:
# Seed the generator with a random training window. np.random.randint
# excludes its upper bound, so passing n_sentences (not n_sentences - 1)
# makes every window, including the last one, eligible.
index = np.random.randint(0, n_sentences)
pattern = dataX[index]

print('Pattern = "' + pattern + '"')
for _ in range(100):
    # One-hot encode the current window as a batch of one.
    x = np.zeros((1, window_length, len(chars)))
    for j, char in enumerate(pattern):
        x[0, j, char2int[char]] = 1.0
    # verbose=0: newer Keras versions otherwise print a progress bar for
    # each predict() call, burying the generated text.
    preds = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    next_char = int2char[int(np.argmax(preds[0]))]
    # Slide the window one character forward.
    pattern = pattern[1:] + next_char
    print(str(next_char), end="")

Pattern = " рэперских горлах взрывае"
тся порохом..я не дуцу прагоилая стибе икакак каракений.и..ри арий нарицару...ириши....ит.я ках тари

In [11]:
# Build (context, next character) pairs for an order-5 character model.
markov_window_length = 5
markov_dataX, markov_dataY = [], []
for chunk in text.split('.'):
    # split('.') drops the delimiter, so put the full stop back.
    chunk = chunk + '.'
    for start in range(len(chunk) - markov_window_length):
        stop = start + markov_window_length
        markov_dataX.append(chunk[start:stop])
        markov_dataY.append(chunk[stop])

# nodes[context][symbol] = number of times `symbol` followed `context`.
nodes = defaultdict(lambda: defaultdict(int))
for position, context in enumerate(markov_dataX):
    nodes[context][markov_dataY[position]] += 1

In [22]:
# Pick a random seed context. np.random.randint excludes its upper bound,
# so passing len(markov_dataX) makes every context, including the last,
# eligible.
index = np.random.randint(0, len(markov_dataX))

pattern = markov_dataX[index]
print('Pattern = "' + pattern + '"')
for _ in range(100):
    # .get() avoids inserting an empty entry into the defaultdict when the
    # context was never observed.
    transitions = nodes.get(pattern)
    if not transitions:
        # Dead end: no continuation is known for this context.
        break
    # Greedy choice: the most frequent follower (first seen wins on ties).
    # Because this is deterministic, the output repeats forever once a
    # context recurs; sampling proportionally to the counts would avoid it.
    next_char = max(transitions, key=transitions.get)
    # Slide the context one character forward.
    pattern = pattern[1:] + next_char
    print(str(next_char), end='')

Pattern = "бронз"
овый кейс прячет розовый кейс прячет розовый кейс прячет розовый кейс прячет розовый кейс прячет роз