In [29]:
import re
import numpy as np

# Load the corpus and lowercase it. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open('./data/LSTM_HABR.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()

# Keep only the characters а-я, whitespace, digits and . ! ? - ; drop the rest.
# NOTE(review): 'ё' (U+0451) lies outside the а-я range, so it is removed here
# as well — confirm this is intended.
text = re.sub(r'[^а-я\s\d.!\?-]+', '', text)
# Collapse every whitespace run (newlines included) into a single space.
text = re.sub(r'\s+', ' ', text)
# Turn runs of ! and ? into a single '.', leaving '.' as the only terminator.
text = re.sub(r'[!\?]+', '.', text)
# Split on '.'; the separator itself is dropped from every piece.
sentences = text.split('.')

In [30]:
def fill_strings(x, y, window, source=None):
    """Append sliding-window samples from each sentence to ``x`` and ``y``.

    A '.' is appended to every sentence first. Then, for each start position
    that leaves a full window followed by at least one more character, the
    window string is appended to ``x`` and the character right after it to
    ``y``. Sentences too short for that contribute nothing.

    Args:
        x: list receiving the window strings (mutated in place).
        y: list receiving the character that follows each window (mutated in
            place).
        window: number of characters per sample.
        source: iterable of sentence strings. When omitted, the notebook-level
            ``sentences`` list is used, which keeps existing calls working.

    Returns:
        ``len(x)`` after appending, i.e. the total size of ``x`` including any
        items it already held.
    """
    if source is None:
        # Backward-compatible default: the list produced by the cleaning cell.
        source = sentences
    for sentence in source:
        terminated = sentence + '.'
        for start in range(len(terminated) - window):
            stop = start + window
            x.append(terminated[start:stop])
            y.append(terminated[stop])
    return len(x)

In [31]:
# Vocabulary: every distinct character left in the cleaned text.
chars = set(text)
size_vocab = len(chars)
# Enumerate the characters in sorted order so the char <-> index mapping is the
# same on every kernel start; the iteration order of a set of str is not stable
# across interpreter runs because of hash randomization.
char_to_int = {c: i for i, c in enumerate(sorted(chars))}
int_to_char = {i: c for c, i in char_to_int.items()}

window = 25

X = []
Y = []

# X receives the window strings, Y the character following each window.
size_sentences = fill_strings(X, Y, window)
x_train = np.zeros((size_sentences, window, size_vocab), dtype=np.bool_)
y_train = np.zeros((size_sentences, size_vocab), dtype=np.bool_)

# One-hot encode every window (x_train) and its target character (y_train).
for i, (x_cur, y_cur) in enumerate(zip(X, Y)):
    for j, ch in enumerate(x_cur):
        x_train[i, j, char_to_int[ch]] = True
    y_train[i, char_to_int[y_cur]] = True

print(x_train.shape[1:])

(25, 42)


# RNN

In [24]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
# Seed NumPy's global RNG before the model is built.
# NOTE(review): whether this also fixes the Keras backend's own RNG depends on
# the backend in use — confirm if reproducible weights are required.
np.random.seed(42)

# Two stacked LSTM layers followed by a softmax layer sized to y_train's
# second dimension.
model = Sequential()
model.add(
    LSTM(
        100,
        input_shape=x_train.shape[1:],
        dropout=0.2,
        recurrent_dropout=0.2,
        return_sequences=True,
    )
)
model.add(LSTM(100))
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer='adam')
model.fit(x_train, y_train, batch_size=20, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1285e7370>

In [32]:
# Take the window length and vocabulary size from the training tensor itself,
# so this cell keeps working after the notebook-level `window` is rebound
# (the Markov cell sets it to 6).
seq_len, vocab_len = x_train.shape[1:]

# randint's upper bound is exclusive, so len(x_train) makes every sample
# (including the last one) eligible as the seed text.
index = np.random.randint(0, len(x_train))
sentence = X[index]
print('Start sentence: "' + sentence + '"')
for step in range(100):
    # One-hot encode the current context as a batch of one.
    x_pred = np.zeros((1, seq_len, vocab_len))
    for t, char in enumerate(sentence):
        x_pred[0, t, char_to_int[char]] = 1.0
    # verbose=0 keeps per-call progress output out of the generated text.
    preds = model.predict(x_pred, verbose=0)
    # Greedy decoding: always take the highest-scoring character.
    next_char = int_to_char[np.argmax(preds)]
    # Slide the context one character forward.
    sentence = sentence[1:] + next_char
    print(next_char, end='')

Start sentence: "овые значения-кандидаты у"
станующих слово состояние ячейки объединяет в работе 2015 года провлемы долговременных зависимостей 

# Markov chain

In [33]:
from collections import defaultdict

# Context length for the Markov chain.
# NOTE(review): this rebinds the notebook-level `window` that the RNN data cell
# set to 25 — re-running cells that read `window` afterwards will see 6.
window = 6
markov_X = []
markov_Y = []
fill_strings(markov_X, markov_Y, window)

# nodes[context][symbol] = how many times `symbol` followed `context`.
nodes = defaultdict(lambda: defaultdict(int))
for context, symbol in zip(markov_X, markov_Y):
    nodes[context][symbol] += 1

# randint's upper bound is exclusive, so len(markov_X) makes every context
# (including the last one) eligible as the seed text.
index = np.random.randint(0, len(markov_X))

sentence = markov_X[index]
print('Start sentence: "' + sentence + '"')
for step in range(100):
    # .get() avoids inserting an empty entry into the defaultdict for a
    # context that was never observed.
    followers = nodes.get(sentence)
    if not followers:
        break
    # Most frequent follower; on ties the first one recorded wins. Normalising
    # the counts would not change which symbol has the maximum.
    next_char = max(followers, key=followers.get)
    sentence = sentence[1:] + next_char
    print(next_char, end='')

Start sentence: "ный сл"
ой нейронные сети содержат обратные связи позволяющие сохранить а 0 полностью они были представлены 