In [110]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import time
import re
from sklearn.model_selection import train_test_split

In [111]:
def load_book(path):
    """Load a book from its file"""
    input_file = os.path.join(path)
    with open(input_file, encoding='utf-8') as f:
        book = f.read()
    return book

In [112]:
def clean_text(text):
    '''Remove unwanted characters and extra spaces from the text'''
    text = re.sub(r'\n', ' ', text) 
    text = re.sub(r'[{}@_*>()\\#%+=\[\]]','', text)
    text = re.sub('a0','', text)
    text = re.sub('\'92t','\'t', text)
    text = re.sub('\'92s','\'s', text)
    text = re.sub('\'92m','\'m', text)
    text = re.sub('\'92ll','\'ll', text)
    text = re.sub('\'91','', text)
    text = re.sub('\'92','', text)
    text = re.sub('\'93','', text)
    text = re.sub('\'94','', text)
    text = re.sub('\.','. ', text)
    text = re.sub('\!','! ', text)
    text = re.sub('\?','? ', text)
    text = re.sub(' +',' ', text)
    return text

In [113]:
prefix = ['авто', 'агро', 'аэро', 'био', 'вело', 'гелио', 'гео', 'гидро', 'зоо', 'изо', 'кино', 
          'макро', 'метео', 'микро', 'моно', 'мото', 'нео', 'палео', 'радио', 'стерео', 'теле', 'фото', 'электро']

In [114]:
first = ['без', 'более', 'ваши', 'веером', 'везти','висят','гола','голу','горло','грань',
         'грушу','дата','длина','доя','ест','завила','завить',
         'касается','кода','коем','мер','мера',
         'можно','нужно','нужны','опять','осень','пищите','плач',
         'полис','помощь','постое',
         'постое','почти','почта','рулей','салон','смысл','среде',
         'поведение','поведение','порез','просто','связи','смоли','точки',
        'точку','участь','частый','что']

In [115]:
second = ['бес', 'белее', 'вши', 'вечером', 'вести','весят','года','году','гордо','грант',
          'грущу','жата','длинна','для','есть','заявила','заявить',
          'качается','когда','коме','мэр','мэра',
          'модно','нудно','нудны','опят','очень','пишите','плачь',
          'полюс','помочь','простое',
          'постное','пости','поста','рублей','солон','смыл','среди',
          'повеление','проведение','парез','просо','вязи','смогли','токи',
          'току','учесть','частный','сто']

In [116]:
predlogs = ['а-ля',
'без',
'без ведома',
'безо',
'благодаря',
'близ',
'близко от',
'в',
'в виде',
'в зависимости от',
'в качестве',
'в лице',
'в отличие от',
'в отношении',
'в пандан',
'в преддверии',
'в продолжение',
'в результате',
'в роли',
'в связи с',
'в силу',
'в соответствии с',
'в течение',
'в целях',
'вблизи',
'ввиду',
'вглубь',
'вдогон',
'вдоль',
'вдоль по',
'взамен',
'включая',
'вкруг',
'вместо',
'вне',
'внизу',
'внутри',
'внутрь',
'во',
'вовнутрь',
'возле',
'вокруг',
'вопреки',
'вослед',
'впереди',
'вплоть до',
'впредь до',
'вразрез',
'вроде',
'вслед',
'вслед за',
'вследствие',
'для',
'для-ради',
'до',
'за',
'за вычетом',
'за исключением',
'за счёт',
'заместо',
'из',
'из-за',
'из-под',
'изнутри',
'изо',
'исключая',
'исходя из',
'к',
'касательно',
'ко',
'кроме',
'кругом',
'лицом к лицу с',
'меж',
'между',
'мимо',
'на',
'на виду у',
'на глазах у',
'на предмет',
'наверху',
'навроде',
'навстречу',
'над',
'надо',
'назад',
'назади',
'накануне',
'наместо',
'наперекор',
'наперерез',
'наподобие',
'напротив',
'наряду с',
'насупротив',
'насчёт',
'начиная с',
'не без',
'не в',
'не за',
'не считая',
'невзирая на',
'недалеко от',
'независимо от',
'несмотря',
'несмотря на',
'ниже',
'о',
'об',
'обо',
'обок',
'около',
'окрест',
'окромя',
'округ',
'опосля',
'опричь',
'от',
'от имени',
'от лица',
'относительно',
'ото',
'перед',
'передо',
'по',
'по линии',
'по мере',
'по направлению к',
'по отношению к',
'по поводу',
'по причине',
'по случаю',
'по сравнению с',
'по-за',
'по-над',
'по-под',
'поблизости от',
'поверх',
'под',
'под видом',
'под эгидой',
'подле',
'подо',
'подобно',
'позади',
'позднее',
'помимо',
'поперёд',
'поперёк',
'порядка',
'посверху',
'посередине',
'посередь',
'после',
'посреди',
'посредине',
'посредством',
'пред',
'предо',
'прежде',
'при',
'при помощи',
'применительно к',
'про',
'промеж',
'против',
'противу',
'путём',
'ради',
'рядом с',
'с',
'с ведома',
'с помощью',
'с прицелом на',
'с точки зрения',
'с целью',
'сверх',
'сверху',
'свыше',
'середь',
'сзади',
'сквозь',
'скрозь',
'следом за',
'смотря по',
'снизу',
'со',
'согласно',
'спустя',
'среди',
'средь',
'сродни',
'судя по',
'супротив',
'у',
'через',
'чрез']

In [117]:
len(predlogs)

197

In [118]:
def noise_maker_russian(sentence):
    noise_sentence = ''
    words = sentence.split(' ')
    error_class = 0
    
    for i in range(0, len(words)):
        if 'тся' in words[i]:
            parts = words[i].split('тся')
            words[i] = 'ться'.join(parts)
            error_class = 1
            break
            
        if 'ться' in words[i]:
            parts = words[i].split('ться')
            words[i] = 'тся'.join(parts)
            error_class = 1
            break
            
        if 'нн' in words[i]:
            parts = words[i].split('нн')
            words[i] = 'н'.join(parts)
            error_class = 2
            break
            
        if 'не' == words[i].lower():
            words[i] = 'ни'
            error_class = 3
            break
            
        if 'ни' == words[i].lower():
            words[i] = 'не'
            error_class = 3
            break
            
        if 'тоже' in words[i].lower():
            words[i] = 'то же'
            error_class = 4
            break
            
        if 'также' in words[i].lower():
            words[i] = 'так же'
            error_class = 4
            break

            
        if 'то' in words[i].lower():
            try:
                if 'же' in words[i+1].lower():
                    words[i] = 'тоже'
                    words[i+1] = ''
                    error_class = 4
                    break
                    
            except:
                words[i] = 'то'
                    
        if 'так' in words[i].lower():
            try:
                if 'же' in words[i+1].lower():
                    words[i] = 'также'
                    words[i+1] = ''
                    error_class = 4
                    break
            except:
                words[i] = 'так'
        
        if '-то' in words[i].lower():
                words[i] = ' то'
                error_class = 5
                break
                
        if words[i].lower() == 'то':
            if '-' not in words[i]:
                try:
                    if not ('же' in words[i+1].lower()):
                        words[i] = '-то'
                        error_class = 5
                        break
                        
                except:
                    words[i] = 'то'
                    
                    
        if 'что' in words[i].lower():
            try:
                if 'бы' in words[i+1].lower():
                    words[i] = 'чтобы'
                    words[i+1] = ''
                    error_class = 4
                    break
                    
            except:
                words[i] = 'что'
                
        if 'что' in words[i].lower():
            try:
                if 'б' in words[i+1].lower():
                    words[i] = 'чтоб'
                    words[i+1] = ''
                    error_class = 4
                    break
            except:
                words[i] = 'что'
                
        if 'чтобы' in words[i].lower():
            words[i] = 'что бы'
            error_class = 4
            break
            
        if 'чтоб' in words[i].lower():
            words[i] = 'что б'
            error_class = 4
            break
            
            
        #######################################
        
        if 'из-за' in words[i].lower():
            words[i] = 'из за'
            error_class = 4
            break
            
        if 'из-под' in words[i].lower():
            words[i] = 'из под'
            error_class = 4
            break
            
        if 'по-за' in words[i].lower():
            words[i] = 'по за'
            error_class = 4
            break
            
        if 'по-над' in words[i].lower():
            words[i] = 'по над'
            break
        
        if 'с-под' in words[i].lower():
            words[i] = 'с под'
            error_class = 4
            break
            
            
        if 'течение' in words[i].lower():
            words[i] = 'течении'
            error_class = 6
            break
            
        if 'течении' in words[i].lower():
            words[i] = 'течение'
            error_class = 6
            break
            
        if 'продолжение' in words[i].lower():
            words[i] = 'продолжении'
            error_class = 6
            break
            
        if 'вследствие' in words[i].lower():
            words[i] = 'в следствии'
            error_class = 6
            break
    
        if 'продолжении' in words[i].lower():
            words[i] = 'продолжение'
            error_class = 6
            break
            
        if 'следствии' in words[i].lower():
            words[i] = 'вследствие'
            error_class = 6
            break
            
            
        #######################################

        if 'насчет' in words[i].lower():
            words[i] = 'на счет'
            error_class = 6
            break
            
        if 'наподобие' in words[i].lower():
            words[i] = 'на подобие'
            error_class = 6
            break
            
        if 'ввиду' in words[i].lower():
            words[i] = 'в виду'
            error_class = 6
            break
            
        if 'вроде' in words[i].lower():
            words[i] = 'в роде'
            error_class = 6
            break
            
        if 'вместо' in words[i].lower():
            words[i] = 'в место'
            error_class = 6
            break
            
        if 'сначала' in words[i].lower():
            words[i] = 'с начала'
            error_class = 6
            break
            
        if 'вконец' in words[i].lower():
            words[i] = 'в конец'
            error_class = 6
            break
            
        if 'вширь' in words[i].lower():
            words[i] = 'в ширь'
            error_class = 6
            break
            
        if 'вдаль' in words[i].lower():
            words[i] = 'в даль'
            error_class = 6
            break
            
        if 'впору' in words[i].lower():
            words[i] = 'в пору'
            error_class = 6
            break
            
        if 'наверху' in words[i].lower():
            words[i] = 'на верху'
            error_class = 6
            break
            
        if 'впредь' in words[i].lower():
            words[i] = 'в предь'
            error_class = 6
            break
            
            
        #######################################

        
        if any(pref in words[i] for pref in prefix):
            words[i] = words[i].lower()
            for pref in prefix:
                check = words[i].split(pref)
                if check[0] == '':
                    words[i] = pref + ' ' + check[1]
                    error_class = 7
                    break 
                    
        if any(w in words[i] for w in first):
            words[i] = words[i].lower()
            for w in first:
                check = words[i].split(w)
                #print(w)
                #print(words[i])
                if check[0] == '' and check[1]=='' and ('естест' not in words[i]) and ('мермер' not in words[i]) and ('стосто' not in words[i]):
                    idx = first.index(words[i].split(',')[0])
                    words[i] = second[idx]
                    error_class = 8
                    break 
                            
        if any(w in words[i] for w in second):
            words[i] = words[i].lower()
            for w in second:
                check = words[i].split(w)
                if check[0] == '' and check[1]=='' and ('естест' not in words[i]) and ('мермер' not in words[i]) and ('стосто' not in words[i]):
                    idx = second.index(words[i].split(',')[0])
                    words[i] = first[idx]
                    error_class = 8
                    break 
        
        if any(pred == words[i] for pred in predlogs):
            words[i] = ''
            error_class = 9
            break 
        
    noise_sentence = ' '.join(words)
    return noise_sentence, error_class

# train

## Загрузка данных

In [119]:
import glob
book_files = []
for name in glob.glob('data/*'):
    book_files.append(name)
# Load the books using the file names
books = []
for book in book_files:
    if book!='.DS_Store':
        #print(book)
        #books.append(load_book(path+book))
        books.append(load_book(book))

In [120]:
# Compare the number of words in each book 
for i in range(len(books)):
    print("There are {} words in {}.".format(len(books[i].split()), book_files[i]))

There are 7364 words in data/full_text_7.txt.
There are 8679 words in data/full_text_1.txt.
There are 8712745 words in data/secret_hack.csv.
There are 43905 words in data/full_text_6.txt.
There are 3322 words in data/full_text.txt.
There are 34454 words in data/full_text_5.txt.
There are 10048 words in data/full_text_2.txt.
There are 8468 words in data/full_text_3.txt.
There are 19203 words in data/full_text_4.txt.


## Подготовка данных

In [121]:
# Clean the text of the books
clean_books = []
for book in books:
    clean_books.append(clean_text(book))

In [122]:
# Create a dictionary to convert the vocabulary (characters) to integers
vocab_to_int = {}
count = 0
for book in clean_books:
    for character in book:
        if character not in vocab_to_int:
            vocab_to_int[character] = count
            count += 1

# Add special tokens to vocab_to_int
codes = ['<PAD>','<EOS>','<GO>']
for code in codes:
    vocab_to_int[code] = count
    count += 1

In [123]:
# Check the size of vocabulary and all of the values
vocab_size = len(vocab_to_int)
print("The vocabulary contains {} characters.".format(vocab_size))
print(sorted(vocab_to_int))

The vocabulary contains 148 characters.
[' ', '!', '"', '&', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '<EOS>', '<GO>', '<PAD>', '?', 'A', 'B', 'C', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '«', '»', 'à', 'é', 'î', 'œ', '́', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'ё', '—', '’']


In [124]:
# Create another dictionary to convert integers to their respective characters
int_to_vocab = {}
for character, value in vocab_to_int.items():
    int_to_vocab[value] = character

In [125]:
# Split the text from the books into sentences.
sentences = []
for book in clean_books:
    for sentence in book.split('.'):
        if len(sentence) < 100:
            sentences.append(sentence + '.')
print("There are {} sentences.".format(len(sentences)))
#596360 vs 362251

There are 362251 sentences.


In [126]:
for i in range(0, len(sentences)):
    sentences[i] = sentences[i].replace(';', ',')

In [127]:
clear_sentences = []
for i in range(0, len(sentences)):
    if sentences[i]!=' .':
        clear_sentences.append(sentences[i].lstrip())

In [128]:
%%time
noise_sentences = []
error_label = []
for sentence in clear_sentences:
    noise, cl = noise_maker_russian(sentence)
    noise_sentences.append(noise)
    error_label.append(cl)

CPU times: user 1min 6s, sys: 0 ns, total: 1min 6s
Wall time: 1min 6s


# writing files

In [129]:
with open('hack.txt', 'w') as file:
    for i in range(0, len(clear_sentences)):
        file.write(noise_sentences[i] + '\t' + clear_sentences[i] +'\t' +'. \n')
        

In [130]:
import keras.backend as K
from sklearn.metrics import confusion_matrix

def perf_measure(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    for i in range(y_hat.shape[2]): 
        if y_actual[i]==y_hat[i]==1:
            TP += 1
        if y_hat[i]==1 and y_actual[i]!=y_hat[i]:
            FP += 1
        if y_actual[i]==y_hat[i]==0:
            TN += 1
        if y_hat[i]==0 and y_actual[i]!=y_hat[i]:
            FN += 1

    return(TP, FP, TN, FN)

def score(y_true, y_pred):
    tn, fp, fn, tp = perf_measure(y_true, y_pred)
    
    prec = tp / (tp + 30*fp)
    
    recall = tp / (tp + 2*fn)
    
    f1 = 2 * (prec * recall) / (prec + recall)
    return f1

# seq 2 seq model

In [168]:
from __future__ import print_function
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [169]:
batch_size = 64  # Batch size for training.
epochs = 200 # Number of epochs to train for.
num_samples = 10000  # Number of samples to train on.



latent_dim = 264  # Latent dimensionality of the encoding space.
# Path to the data txt file on disk.
data_path = 'hack.txt'#'fra.txt'

In [170]:
# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
    input_text, target_text, _ = line.split('\t')
    # We use "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character.
    target_text = '\t' + target_text + '\n'
    input_texts.append(input_text)
    target_texts.append(target_text)
    for char in input_text:
        if char not in input_characters:
            input_characters.add(char)
    for char in target_text:
        if char not in target_characters:
            target_characters.add(char)

input_characters = sorted(list(input_characters))
target_characters = sorted(list(target_characters))
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max([len(txt) for txt in input_texts])
max_decoder_seq_length = max([len(txt) for txt in target_texts])

print('Number of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

Number of samples: 10000
Number of unique input tokens: 118
Number of unique output tokens: 120
Max sequence length for inputs: 100
Max sequence length for outputs: 101


In [171]:
input_token_index = dict(
    [(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict(
    [(char, i) for i, char in enumerate(target_characters)])

encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype='float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
    dtype='float32')

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
    
    
    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
    decoder_target_data[i, t:, target_token_index[' ']] = 1.
    
    
# Define an input sequence and process it.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [172]:
# Run training
model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
              metrics=['accuracy'])

In [173]:
from keras.callbacks import ModelCheckpoint

In [174]:
filepath='/home/karina/orfo/model3_{epoch:02d}.h5'
checkpoint = ModelCheckpoint(filepath, period=25, verbose=1, save_best_only=False)
callbacks_list = [checkpoint]

In [175]:
model.fit([np.copy(encoder_input_data), decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
          callbacks=callbacks_list,
         verbose=2)
# Save model
model.save('s2s3_200ep_tr.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/200
 - 36s - loss: 2.1167 - acc: 0.5303 - val_loss: 1.9394 - val_acc: 0.4981
Epoch 2/200
 - 33s - loss: 1.6955 - acc: 0.5622 - val_loss: 1.6564 - val_acc: 0.5537
Epoch 3/200
 - 32s - loss: 1.4262 - acc: 0.6068 - val_loss: 1.4884 - val_acc: 0.5827
Epoch 4/200
 - 31s - loss: 1.2805 - acc: 0.6365 - val_loss: 1.3954 - val_acc: 0.6004
Epoch 5/200
 - 32s - loss: 1.1931 - acc: 0.6565 - val_loss: 1.3336 - val_acc: 0.6146
Epoch 6/200
 - 33s - loss: 1.1149 - acc: 0.6769 - val_loss: 1.2795 - val_acc: 0.6286
Epoch 7/200
 - 34s - loss: 1.0406 - acc: 0.6972 - val_loss: 1.2251 - val_acc: 0.6430
Epoch 8/200
 - 33s - loss: 0.9581 - acc: 0.7216 - val_loss: 1.1453 - val_acc: 0.6664
Epoch 9/200
 - 33s - loss: 0.8728 - acc: 0.7477 - val_loss: 1.0795 - val_acc: 0.6857
Epoch 10/200
 - 33s - loss: 0.7909 - acc: 0.7739 - val_loss: 1.0173 - val_acc: 0.7029
Epoch 11/200
 - 32s - loss: 0.7138 - acc: 0.7980 - val_loss: 0.9629 - val_acc: 0.7198
Epoch 12/200
 -

  '. They will not be included '


Epoch 26/200
 - 35s - loss: 0.1892 - acc: 0.9521 - val_loss: 0.2998 - val_acc: 0.9231
Epoch 27/200
 - 33s - loss: 0.1775 - acc: 0.9549 - val_loss: 0.2892 - val_acc: 0.9241
Epoch 28/200
 - 33s - loss: 0.1682 - acc: 0.9568 - val_loss: 0.2911 - val_acc: 0.9206
Epoch 29/200
 - 33s - loss: 0.1598 - acc: 0.9587 - val_loss: 0.2387 - val_acc: 0.9411
Epoch 30/200
 - 32s - loss: 0.1333 - acc: 0.9643 - val_loss: 0.2032 - val_acc: 0.9477
Epoch 34/200
 - 32s - loss: 0.1293 - acc: 0.9653 - val_loss: 0.1913 - val_acc: 0.9517
Epoch 35/200
 - 32s - loss: 0.1240 - acc: 0.9661 - val_loss: 0.2023 - val_acc: 0.9473
Epoch 36/200
 - 32s - loss: 0.1206 - acc: 0.9671 - val_loss: 0.1753 - val_acc: 0.9545
Epoch 37/200
 - 32s - loss: 0.1183 - acc: 0.9672 - val_loss: 0.1727 - val_acc: 0.9552
Epoch 38/200
 - 32s - loss: 0.1145 - acc: 0.9679 - val_loss: 0.1630 - val_acc: 0.9567
Epoch 39/200
 - 32s - loss: 0.1112 - acc: 0.9686 - val_loss: 0.1730 - val_acc: 0.9529
Epoch 40/200
 - 32s - loss: 0.1085 - acc: 0.9693 - val

In [None]:
from keras.models import load_model
#model = load_model('model3_25.h5')

In [177]:
# Define sampling models
encoder_model = Model(encoder_inputs, encoder_states)

decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict(
    (i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict(
    (i, char) for char, i in target_token_index.items())

In [178]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_41 (InputLayer)           (None, None, 118)    0                                            
__________________________________________________________________________________________________
input_42 (InputLayer)           (None, None, 120)    0                                            
__________________________________________________________________________________________________
lstm_17 (LSTM)                  [(None, 264), (None, 404448      input_41[0][0]                   
__________________________________________________________________________________________________
lstm_18 (LSTM)                  [(None, None, 264),  406560      input_42[0][0]                   
                                                                 lstm_17[0][1]                    
          

In [179]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index['\t']] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           len(decoded_sentence) > max_decoder_seq_length):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Update states
        states_value = [h, c]

    return decoded_sentence


In [180]:
for seq_index in range(len(encoder_input_data)):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)

Input sentence: ПИКОВАЯ ДАМА Пиковая дама означает тайную недоброжелательность.
Decoded sentence: ПИКОВАЯ ДАМА Пиковая дама означает тайную недоброжелательность.

Input sentence: Новейшая гадательная книга.
Decoded sentence: Новейшая гадательная книга.

Input sentence: Так,  ненастные дни, Занимались они Делом.
Decoded sentence: Так, в ненастные дни, Занимались они Делом.

Input sentence: Однажды играли  карты у конногвардейца Нарумова.
Decoded sentence: Однажды играли в карты у конногвардейца Нарумова.

Input sentence: Долгая зимняя ночь прошла незаметно, сели ужинать  пятом часу утра.
Decoded sentence: Долгая зимняя ночь прошла незалевались неи огнак, и лушадовольто миго и качлого.

Input sentence: Но шампанское явилось, разговор оживился, и все приняли  нем участие.
Decoded sentence: Но шампанское явилось, разговор оживился, и все приняли в нем участие.

Input sentence: — Что ты сделал, Сурин? — спросил хозяин.
Decoded sentence: — Что это ты делаешь? — спросил я.

Input sentence: — 

KeyboardInterrupt: 