In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from keras.models import Model, model_from_json, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import keras
from tqdm import tqdm
import pandas as pd
import pickle
from time import time
#from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


In [3]:
keras.__version__

'2.1.5'

In [4]:
def vectorize(df):
    eng_sentences = list(df['x'])
    fra_sentences = list(df['y'])
    #print(eng_sentences[:5])
    nb_samples = df.shape[0]
    tokenized_eng_sentences = np.zeros(shape = (nb_samples,max_len_eng_sent,len(eng_chars)), dtype='float32')
    tokenized_fra_sentences = np.zeros(shape = (nb_samples,max_len_fra_sent,len(fra_chars)), dtype='float32')
    target_data = np.zeros((nb_samples, max_len_fra_sent, len(fra_chars)),dtype='float32')
    # Vectorize the english and french sentences

    for i in tqdm(range(nb_samples)):
        for k,ch in enumerate(eng_sentences[i]):
            tokenized_eng_sentences[i,k,eng_char_to_index_dict[ch]] = 1

        for k,ch in enumerate(fra_sentences[i]):
            tokenized_fra_sentences[i,k,fra_char_to_index_dict[ch]] = 1

            # decoder_target_data will be ahead by one timestep and will not include the start character.
            if k > 0:
                target_data[i,k-1,fra_char_to_index_dict[ch]] = 1
    return tokenized_eng_sentences, tokenized_fra_sentences, target_data

In [5]:
def tokenize(seq):
    tokenized_eng_sentence = np.zeros(shape = (1,max_len_eng_sent,len(eng_chars)), dtype='float32')
    for k,ch in enumerate(seq):
        tokenized_eng_sentence[0,k,eng_char_to_index_dict[ch]] = 1
        
    return tokenized_eng_sentence


In [6]:
def decode_seq(inp_seq,encoder_model_inf,decoder_model_inf):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, len(fra_chars)))
    target_seq[0, 0, fra_char_to_index_dict['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    prob = 1.0
    while not stop_condition:
        
        #decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        #predict_pr = decoder_model_inf.predict_proba(x=[target_seq] + states_val)
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        #print(decoder_out)
        max_val_index = np.argmax(decoder_out[0,-1,:])
        max_val = np.max(decoder_out[0,-1,:])
        prob *= max_val
        sampled_fra_char = fra_index_to_char_dict[max_val_index]
        translated_sent += sampled_fra_char
        #print('{} == {}'.format(sampled_fra_char,max_val))
        
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > max_len_fra_sent)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1, len(fra_chars)))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
    
    prob = prob**(1/len(translated_sent))
    return translated_sent, prob

In [7]:
def fitting(step, model, tokenized_eng_sentences, tokenized_fra_sentences, target_data):
    print('fitting on {} words...'.format(tokenized_eng_sentences.shape[0]))
    t = time()
    model.fit(x=[tokenized_eng_sentences,tokenized_fra_sentences], 
              y=target_data,
              batch_size=64,
              epochs=40,
              validation_split=0.2,
             verbose = 0)
    #print(model.layers)
    print('creating new inference model...')
    # Encoder inference model
    encoder_model_inf = Model(encoder_input, encoder_states)
    # Decoder inference model
    decoder_state_input_h = Input(shape=(256,))
    decoder_state_input_c = Input(shape=(256,))
    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

    decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                     initial_state=decoder_input_states)

    decoder_states = [decoder_h , decoder_c]

    decoder_out = decoder_dense(decoder_out)

    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                              outputs=[decoder_out] + decoder_states )
    print('saving inference models...')
    encoder_model_inf.save('retraining_models/encoder-{}.h5'.format(step))
    decoder_model_inf.save('retraining_models/decoder-{}.h5'.format(step))
    model.save('retraining_models/model-{}.h5'.format(step))
    print('fitting took {} min'.format(round(((time()-t)/60),4)))
    return(encoder_model_inf,decoder_model_inf)

In [8]:
#на n-ной итерации берем n*10000 примеров из теста, из них добавляем 10000 к трейну
def testing(encoder_model_inf,decoder_model_inf,train,train_rest,count,step):
    t = time()
    length = max((train_rest).shape[0],step*count) #сколько отрезаем для теста или берем весь
    test = train_rest[:length]
    print('testing on {} words...'.format(test.shape[0]))
    predictions = []
    probs = []
    for row in tqdm(range(test.shape[0])):
        x = test.iloc[row,5]
        x_seq = tokenize(x)
        y,prob = decode_seq(x_seq,encoder_model_inf,decoder_model_inf)
        predictions.append(y.strip())
        probs.append(prob)
    test['predictions'] = predictions
    test['probs'] = probs
    test = test.sort_values(by=['probs'])
    name = 'test_after_{}_step.csv'.format(step)
    test[['x','predictions','probs']].to_csv('training_loop/test/' + name, sep='\t', encoding = 'utf-8')
    test = test[:count] # берем худшие count строк
    #выкидываем из train_rest строки, которые уходят в трейн
    valuelist = test['x']
    train_rest = train_rest[~train_rest.x.isin(valuelist)]
    train = test.copy()
    name_train = 'train_rest_after_{}_step.csv'.format(step)
    train_rest[['x']].to_csv('training_loop/test/' + name_train, sep='\t', encoding = 'utf-8')
    train = test.copy()
    print('testing took {} min'.format(round(((time()-t)/60),4)))
    return train,train_rest
    

### один раз вычитываем данные,  создаем encoder decoder, компилируем модель

In [8]:
total_train = pd.read_csv('data/train/train_full.csv', sep='\t', encoding = 'utf-8')
total_train['x'] = total_train.apply(lambda row: row['lemma'] + str(row['formtag']), axis=1)
total_train['y'] ='\t' + total_train["form"] + '\n'

In [10]:
train_filtered = total_train[total_train.apply(lambda x: not x['classtag'].endswith(('-','+','0')), axis=1)]

In [11]:
data = pickle.load( open( "data.p", "rb" ))
eng_chars = data['eng_chars']
fra_chars = data['fra_chars']
max_len_eng_sent = data['max_len_eng_sent']
max_len_fra_sent = data['max_len_fra_sent']
eng_index_to_char_dict = data['eng_index_to_char_dict']
eng_char_to_index_dict = data['eng_char_to_index_dict']
fra_index_to_char_dict = data['fra_index_to_char_dict']
fra_char_to_index_dict = data['fra_char_to_index_dict']

In [12]:
# Encoder model
encoder_input = Input(shape=(None,len(eng_chars)))
encoder_LSTM = LSTM(256,return_state = True)
encoder_outputs, encoder_h, encoder_c = encoder_LSTM (encoder_input)
encoder_states = [encoder_h, encoder_c]
# Decoder model
decoder_input = Input(shape=(None,len(fra_chars)))
decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(len(fra_chars),activation='softmax')
decoder_out = decoder_dense (decoder_out)
model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])
# Compiling
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [13]:
count = 10000
train,train_rest = train_filtered[:count],train_filtered[count:]
step = 1
while train_rest.shape[0] > 0:
    print('iteration: {}, rest of train: {}'.format(step, train_rest.shape[0]))
    tokenized_eng_sentences, tokenized_fra_sentences, target_data = vectorize(train)
    encoder_model_inf,decoder_model_inf = fitting(step, model, tokenized_eng_sentences, tokenized_fra_sentences, target_data) 
    train,train_rest = testing(encoder_model_inf,decoder_model_inf,train,train_rest,count,step)
    step += 1

iteration: 1, rest of train: 373532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 65603.14it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.3065 min
testing on 373532 words...


100%|█████████████████████████████████████████████████████████████████████████| 373532/373532 [58:44<00:00, 105.98it/s]


testing took 58.793 min
iteration: 2, rest of train: 363532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 69744.19it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.0127 min
testing on 363532 words...


100%|█████████████████████████████████████████████████████████████████████████| 363532/363532 [56:15<00:00, 107.69it/s]


testing took 56.3067 min
iteration: 3, rest of train: 353532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 62713.97it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.0465 min
testing on 353532 words...


100%|█████████████████████████████████████████████████████████████████████████| 353532/353532 [55:32<00:00, 106.09it/s]


testing took 55.5924 min
iteration: 4, rest of train: 343532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 60081.28it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.2535 min
testing on 343532 words...


100%|█████████████████████████████████████████████████████████████████████████| 343532/343532 [53:44<00:00, 106.55it/s]


testing took 53.781 min
iteration: 5, rest of train: 333532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 62334.17it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.0945 min
testing on 333532 words...


100%|█████████████████████████████████████████████████████████████████████████| 333532/333532 [52:40<00:00, 105.53it/s]


testing took 52.72 min
iteration: 6, rest of train: 323532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 64350.15it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.1417 min
testing on 323532 words...


100%|█████████████████████████████████████████████████████████████████████████| 323532/323532 [51:06<00:00, 105.49it/s]


testing took 51.1573 min
iteration: 7, rest of train: 313532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 66036.12it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.193 min
testing on 313532 words...


100%|█████████████████████████████████████████████████████████████████████████| 313532/313532 [49:38<00:00, 105.27it/s]


testing took 49.6797 min
iteration: 8, rest of train: 303532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 64751.80it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.2035 min
testing on 303532 words...


100%|█████████████████████████████████████████████████████████████████████████| 303532/303532 [48:00<00:00, 105.38it/s]


testing took 48.0448 min
iteration: 9, rest of train: 293532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 64333.76it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.2735 min
testing on 293532 words...


100%|█████████████████████████████████████████████████████████████████████████| 293532/293532 [46:00<00:00, 106.33it/s]


testing took 46.0493 min
iteration: 10, rest of train: 283532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 63932.88it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.3051 min
testing on 283532 words...


100%|████████████████████████████████████████████████████████████████████████| 283532/283532 [1:03:10<00:00, 74.80it/s]


testing took 63.2523 min
iteration: 11, rest of train: 273532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 36135.77it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 15.0605 min
testing on 273532 words...


100%|████████████████████████████████████████████████████████████████████████| 273532/273532 [1:10:39<00:00, 64.52it/s]


testing took 70.7256 min
iteration: 12, rest of train: 263532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 40215.31it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 14.983 min
testing on 263532 words...


100%|████████████████████████████████████████████████████████████████████████| 263532/263532 [1:03:41<00:00, 68.97it/s]


testing took 63.7543 min
iteration: 13, rest of train: 253532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 42621.49it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 14.8766 min
testing on 253532 words...


100%|████████████████████████████████████████████████████████████████████████| 253532/253532 [1:01:22<00:00, 68.86it/s]


testing took 61.4321 min
iteration: 14, rest of train: 243532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 39734.97it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 15.0706 min
testing on 243532 words...


100%|██████████████████████████████████████████████████████████████████████████| 243532/243532 [58:25<00:00, 69.48it/s]


testing took 58.4829 min
iteration: 15, rest of train: 233532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 35366.89it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 15.0255 min
testing on 233532 words...


100%|██████████████████████████████████████████████████████████████████████████| 233532/233532 [54:52<00:00, 70.93it/s]


testing took 54.9342 min
iteration: 16, rest of train: 223532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 43362.92it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 14.7046 min
testing on 223532 words...


100%|██████████████████████████████████████████████████████████████████████████| 223532/223532 [50:39<00:00, 73.54it/s]


testing took 50.7169 min
iteration: 17, rest of train: 213532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 42988.83it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 14.5336 min
testing on 213532 words...


100%|██████████████████████████████████████████████████████████████████████████| 213532/213532 [47:35<00:00, 74.79it/s]


testing took 47.6401 min
iteration: 18, rest of train: 203532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 44723.88it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 14.6142 min
testing on 203532 words...


100%|██████████████████████████████████████████████████████████████████████████| 203532/203532 [46:44<00:00, 72.57it/s]


testing took 46.7949 min
iteration: 19, rest of train: 193532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 39577.16it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 13.9151 min
testing on 193532 words...


100%|█████████████████████████████████████████████████████████████████████████| 193532/193532 [30:52<00:00, 104.45it/s]


testing took 30.9071 min
iteration: 20, rest of train: 183532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 63525.61it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.9949 min
testing on 183532 words...


100%|█████████████████████████████████████████████████████████████████████████| 183532/183532 [28:47<00:00, 106.24it/s]


testing took 28.8164 min
iteration: 21, rest of train: 173532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 66489.82it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.9826 min
testing on 173532 words...


100%|█████████████████████████████████████████████████████████████████████████| 173532/173532 [27:21<00:00, 105.73it/s]


testing took 27.3767 min
iteration: 22, rest of train: 163532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 68297.90it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.0422 min
testing on 163532 words...


100%|█████████████████████████████████████████████████████████████████████████| 163532/163532 [26:41<00:00, 102.10it/s]


testing took 26.717 min
iteration: 23, rest of train: 153532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 65615.25it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.0762 min
testing on 153532 words...


100%|█████████████████████████████████████████████████████████████████████████| 153532/153532 [24:46<00:00, 103.30it/s]


testing took 24.7919 min
iteration: 24, rest of train: 143532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 64344.92it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.0321 min
testing on 143532 words...


100%|█████████████████████████████████████████████████████████████████████████| 143532/143532 [22:59<00:00, 104.01it/s]


testing took 23.0178 min
iteration: 25, rest of train: 133532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 63526.58it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.0679 min
testing on 133532 words...


100%|█████████████████████████████████████████████████████████████████████████| 133532/133532 [21:22<00:00, 104.12it/s]


testing took 21.3947 min
iteration: 26, rest of train: 123532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 64757.29it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.0951 min
testing on 123532 words...


100%|█████████████████████████████████████████████████████████████████████████| 123532/123532 [19:37<00:00, 104.88it/s]


testing took 19.6469 min
iteration: 27, rest of train: 113532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 65185.22it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.1078 min
testing on 113532 words...


100%|█████████████████████████████████████████████████████████████████████████| 113532/113532 [18:11<00:00, 104.04it/s]


testing took 18.2016 min
iteration: 28, rest of train: 103532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 65606.83it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.1773 min
testing on 103532 words...


100%|█████████████████████████████████████████████████████████████████████████| 103532/103532 [16:22<00:00, 105.41it/s]


testing took 16.387 min
iteration: 29, rest of train: 93532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 66489.50it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.1877 min
testing on 93532 words...


100%|███████████████████████████████████████████████████████████████████████████| 93532/93532 [14:50<00:00, 105.03it/s]


testing took 14.8553 min
iteration: 30, rest of train: 83532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 67844.74it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.205 min
testing on 83532 words...


100%|███████████████████████████████████████████████████████████████████████████| 83532/83532 [13:11<00:00, 105.51it/s]


testing took 13.2065 min
iteration: 31, rest of train: 73532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 67849.13it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.2365 min
testing on 73532 words...


100%|███████████████████████████████████████████████████████████████████████████| 73532/73532 [11:30<00:00, 106.46it/s]


testing took 11.5227 min
iteration: 32, rest of train: 63532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 63525.81it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.232 min
testing on 63532 words...


100%|███████████████████████████████████████████████████████████████████████████| 63532/63532 [09:50<00:00, 107.65it/s]


testing took 9.846 min
iteration: 33, rest of train: 53532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 69260.25it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.2623 min
testing on 53532 words...


100%|███████████████████████████████████████████████████████████████████████████| 53532/53532 [08:13<00:00, 108.42it/s]


testing took 8.238 min
iteration: 34, rest of train: 43532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 68317.70it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.3197 min
testing on 43532 words...


100%|███████████████████████████████████████████████████████████████████████████| 43532/43532 [06:40<00:00, 108.68it/s]


testing took 6.6838 min
iteration: 35, rest of train: 33532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 72799.45it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.1597 min
testing on 33532 words...


100%|███████████████████████████████████████████████████████████████████████████| 33532/33532 [04:57<00:00, 112.74it/s]


testing took 4.9639 min
iteration: 36, rest of train: 23532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 72784.42it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.1382 min
testing on 23532 words...


100%|███████████████████████████████████████████████████████████████████████████| 23532/23532 [03:27<00:00, 113.64it/s]


testing took 3.4569 min
iteration: 37, rest of train: 13532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 72800.34it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.0941 min
testing on 13532 words...


100%|███████████████████████████████████████████████████████████████████████████| 13532/13532 [01:57<00:00, 115.14it/s]


testing took 1.9634 min
iteration: 38, rest of train: 3532


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 77907.17it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 12.1963 min
testing on 3532 words...


100%|█████████████████████████████████████████████████████████████████████████████| 3532/3532 [00:30<00:00, 114.50it/s]


testing took 0.518 min
