In [1]:
import warnings
warnings.filterwarnings('ignore')

In [1]:
from keras.models import Model, model_from_json, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import keras
from tqdm import tqdm
import pandas as pd
import pickle
from time import time
#from keras.utils.vis_utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
keras.__version__

'2.1.6'

In [4]:
# A fast and memory efficient implementation
# by Hjelmqvist, Sten
def levenshtein(s, t):
    # degenerate cases
    if s == t:
        return 0
    if len(s) == 0:
        return len(t)
    if len(t) == 0:
        return len(s)
  
    # create two work vectors of integer distances
    #int[] v0 = new int[t.Length + 1];
    #int[] v1 = new int[t.Length + 1];
    v0 = []
    v1 = []
  
    # initialize v0 (the previous row of distances)
    # this row is A[0][i]: edit distance for an empty s
    # the distance is just the number of characters to delete from t
    # for (int i = 0; i < v0.Length; i++)
    # v0[i] = i;
    for i in range(len(t)+1):
        v0.append(i)
        v1.append(0)
 
    for i in range(len(s)): 
        # calculate v1 (current row distances) from the previous row v0
        # first element of v1 is A[i+1][0]
        # edit distance is delete (i+1) chars from s to match empty t
        v1[0] = i + 1
  
        # use formula to fill in the rest of the row
        for j in range(len(t)):
            cost = 0 if s[i] == t[j] else 1;
            v1[j + 1] = min(v1[j]+1, v0[j+1]+1, v0[j]+cost)
  
        # copy v1 (current row) to v0 (previous row) for next iteration
        for j in range(len(t)+1):
            v0[j] = v1[j]
  
    return v1[len(t)]

In [5]:
def vectorize(df):
    eng_sentences = list(df['x'])
    fra_sentences = list(df['y'])
    #print(eng_sentences[:5])
    nb_samples = df.shape[0]
    tokenized_eng_sentences = np.zeros(shape = (nb_samples,max_len_eng_sent,len(eng_chars)), dtype='float32')
    tokenized_fra_sentences = np.zeros(shape = (nb_samples,max_len_fra_sent,len(fra_chars)), dtype='float32')
    target_data = np.zeros((nb_samples, max_len_fra_sent, len(fra_chars)),dtype='float32')
    # Vectorize the english and french sentences

    for i in tqdm(range(nb_samples)):
        for k,ch in enumerate(eng_sentences[i]):
            tokenized_eng_sentences[i,k,eng_char_to_index_dict[ch]] = 1

        for k,ch in enumerate(fra_sentences[i]):
            tokenized_fra_sentences[i,k,fra_char_to_index_dict[ch]] = 1

            # decoder_target_data will be ahead by one timestep and will not include the start character.
            if k > 0:
                target_data[i,k-1,fra_char_to_index_dict[ch]] = 1
    return tokenized_eng_sentences, tokenized_fra_sentences, target_data

In [6]:
def tokenize(seq):
    tokenized_eng_sentence = np.zeros(shape = (1,max_len_eng_sent,len(eng_chars)), dtype='float32')
    for k,ch in enumerate(seq):
        tokenized_eng_sentence[0,k,eng_char_to_index_dict[ch]] = 1
        
    return tokenized_eng_sentence


In [7]:
def decode_seq(inp_seq,encoder_model_inf,decoder_model_inf):
    
    # Initial states value is coming from the encoder 
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, len(fra_chars)))
    target_seq[0, 0, fra_char_to_index_dict['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        
        #decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        #predict_pr = decoder_model_inf.predict_proba(x=[target_seq] + states_val)
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        #print(decoder_out)
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_fra_char = fra_index_to_char_dict[max_val_index]
        translated_sent += sampled_fra_char
        #print('{} == {}'.format(sampled_fra_char,max_val))
        
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > max_len_fra_sent)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1, len(fra_chars)))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
    return translated_sent

In [8]:
def fitting(step, model, tokenized_eng_sentences, tokenized_fra_sentences, target_data):
    print('fitting on {} words...'.format(tokenized_eng_sentences.shape[0]))
    t = time()
    model.fit(x=[tokenized_eng_sentences,tokenized_fra_sentences], 
              y=target_data,
              batch_size=64,
              epochs=40,
              validation_split=0.2,
             verbose = 0)
    #print(model.layers)
    print('creating new inference model...')
    # Encoder inference model
    encoder_model_inf = Model(encoder_input, encoder_states)
    # Decoder inference model
    decoder_state_input_h = Input(shape=(256,))
    decoder_state_input_c = Input(shape=(256,))
    decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

    decoder_out, decoder_h, decoder_c = decoder_LSTM(decoder_input, 
                                                     initial_state=decoder_input_states)

    decoder_states = [decoder_h , decoder_c]

    decoder_out = decoder_dense(decoder_out)

    decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                              outputs=[decoder_out] + decoder_states )
    print('saving inference models...')
    encoder_model_inf.save('retraining_models/lev/encoder-{}.h5'.format(step))
    decoder_model_inf.save('retraining_models/lev/decoder-{}.h5'.format(step))
    model.save('retraining_models/lev/model-{}.h5'.format(step))
    print('fitting took {} min'.format(round(((time()-t)/60),4)))
    return(encoder_model_inf,decoder_model_inf)

In [9]:
#на n-ной итерации берем n*10000 примеров из теста, из них добавляем 10000 к трейну
def testing(encoder_model_inf,decoder_model_inf,train,train_rest,count,step):
    t = time()
    length = max((train_rest).shape[0],step*count) #сколько отрезаем для теста или берем весь
    test = train_rest[:length]
    print('testing on {} words...'.format(test.shape[0]))
    predictions = []
    dists = []
    for row in tqdm(range(test.shape[0])):
        x = test.iloc[row,5]
        x_seq = tokenize(x)
        y = decode_seq(x_seq,encoder_model_inf,decoder_model_inf)
        prediction = y.strip()
        predictions.append(prediction)
        true_form = test.iloc[row,4]
        dist = levenshtein(prediction,true_form)
        dists.append(dist)
    test['predictions'] = predictions
    test['dists'] = dists
    test = test.sort_values(by=['dists'], ascending = False)
    name = 'test_after_{}_step.csv'.format(step)
    test[['x','predictions','dists']].to_csv('training_loop/test/lev/' + name, sep='\t', encoding = 'utf-8')
    test = test[:count] # берем худшие count строк
    #выкидываем из train_rest строки, которые уходят в трейн
    valuelist = test['x']
    train_rest = train_rest[~train_rest.x.isin(valuelist)]
    train = test.copy()
    name_train = 'train_rest_after_{}_step.csv'.format(step)
    train_rest[['x']].to_csv('training_loop/test/lev/' + name_train, sep='\t', encoding = 'utf-8')
    train = test.copy()
    print('testing took {} min'.format(round(((time()-t)/60),4)))
    return train,train_rest
    

### один раз вычитываем данные,  создаем encoder decoder, компилируем модель

In [7]:
total_train = pd.read_csv('data/train/train_full.csv', sep='\t', encoding = 'utf-8')
total_train['x'] = total_train.apply(lambda row: row['lemma'] + str(row['formtag']), axis=1)
total_train['y'] ='\t' + total_train["form"] + '\n'

In [8]:
train_filtered = total_train[total_train.apply(lambda x: not x['classtag'].endswith(('-','+','0')), axis=1)]

In [9]:
len(train_filtered)

509928

In [10]:
509928/12

42494.0

In [12]:
train_filtered[:5]

Unnamed: 0,lemma,gender,formtag,classtag,form,x,y
0,високос,м,2N,м1а,високосы,високос2N,\tвисокосы\n
1,зимовщик,мо,2N,мо3а,зимовщики,зимовщик2N,\tзимовщики\n
2,бензель,м,1D,м2а,бензелю,бензель1D,\tбензелю\n
3,хромоскоп,м,1A,м1а,хромоскоп,хромоскоп1A,\tхромоскоп\n
4,дерматология,ж,1L,ж7а,дерматологии,дерматология1L,\tдерматологии\n


In [13]:
data = pickle.load( open( "data.p", "rb" ))
eng_chars = data['eng_chars']
fra_chars = data['fra_chars']
max_len_eng_sent = data['max_len_eng_sent']
max_len_fra_sent = data['max_len_fra_sent']
eng_index_to_char_dict = data['eng_index_to_char_dict']
eng_char_to_index_dict = data['eng_char_to_index_dict']
fra_index_to_char_dict = data['fra_index_to_char_dict']
fra_char_to_index_dict = data['fra_char_to_index_dict']

In [14]:
# Encoder model
encoder_input = Input(shape=(None,len(eng_chars)))
encoder_LSTM = LSTM(256,return_state = True)
encoder_outputs, encoder_h, encoder_c = encoder_LSTM (encoder_input)
encoder_states = [encoder_h, encoder_c]
# Decoder model
decoder_input = Input(shape=(None,len(fra_chars)))
decoder_LSTM = LSTM(256,return_sequences=True, return_state = True)
decoder_out, _ , _ = decoder_LSTM(decoder_input, initial_state=encoder_states)
decoder_dense = Dense(len(fra_chars),activation='softmax')
decoder_out = decoder_dense (decoder_out)
model = Model(inputs=[encoder_input, decoder_input],outputs=[decoder_out])
# Compiling
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

In [15]:
count = 10000
train,train_rest = train_filtered[:count],train_filtered[count:]
step = 1
while train_rest.shape[0] > 0:
    print('iteration: {}, rest of train: {}'.format(step, train_rest.shape[0]))
    tokenized_eng_sentences, tokenized_fra_sentences, target_data = vectorize(train)
    encoder_model_inf,decoder_model_inf = fitting(step, model, tokenized_eng_sentences, tokenized_fra_sentences, target_data) 
    train,train_rest = testing(encoder_model_inf,decoder_model_inf,train,train_rest,count,step)
    step += 1

iteration: 1, rest of train: 373532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 49746.29it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 9.6806 min
testing on 373532 words...


100%|████████████████████████████████| 373532/373532 [1:12:21<00:00, 86.03it/s]


testing took 72.4239 min
iteration: 2, rest of train: 363532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 30711.66it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 9.4798 min
testing on 363532 words...


100%|████████████████████████████████| 363532/363532 [1:08:21<00:00, 88.62it/s]


testing took 68.4268 min
iteration: 3, rest of train: 353532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 47075.86it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 9.5308 min
testing on 353532 words...


100%|████████████████████████████████| 353532/353532 [1:07:53<00:00, 86.79it/s]


testing took 67.947 min
iteration: 4, rest of train: 343532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 39776.49it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 9.5909 min
testing on 343532 words...


100%|████████████████████████████████| 343532/343532 [1:03:56<00:00, 89.55it/s]


testing took 63.9932 min
iteration: 5, rest of train: 333532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 55430.87it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 9.5425 min
testing on 333532 words...


100%|████████████████████████████████| 333532/333532 [1:02:55<00:00, 88.34it/s]


testing took 62.9737 min
iteration: 6, rest of train: 323532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 46815.30it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.6143 min
testing on 323532 words...


100%|██████████████████████████████████| 323532/323532 [54:13<00:00, 99.45it/s]


testing took 54.2684 min
iteration: 7, rest of train: 313532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 60458.26it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.6353 min
testing on 313532 words...


100%|██████████████████████████████████| 313532/313532 [52:45<00:00, 99.04it/s]


testing took 52.8115 min
iteration: 8, rest of train: 303532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 46903.46it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 11.7574 min
testing on 303532 words...


100%|██████████████████████████████████| 303532/303532 [52:01<00:00, 97.23it/s]


testing took 52.0751 min
iteration: 9, rest of train: 293532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 50656.89it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.6818 min
testing on 293532 words...


100%|██████████████████████████████████| 293532/293532 [49:09<00:00, 99.51it/s]


testing took 49.2075 min
iteration: 10, rest of train: 283532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 50863.78it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.6818 min
testing on 283532 words...


100%|██████████████████████████████████| 283532/283532 [47:23<00:00, 99.72it/s]


testing took 47.4289 min
iteration: 11, rest of train: 273532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 54944.00it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2581 min
testing on 273532 words...


100%|██████████████████████████████████| 273532/273532 [45:43<00:00, 99.69it/s]


testing took 45.7713 min
iteration: 12, rest of train: 263532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 50863.78it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.1812 min
testing on 263532 words...


100%|██████████████████████████████████| 263532/263532 [44:00<00:00, 99.80it/s]


testing took 44.0492 min
iteration: 13, rest of train: 253532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 55247.60it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.1937 min
testing on 253532 words...


100%|██████████████████████████████████| 253532/253532 [42:23<00:00, 99.69it/s]


testing took 42.4267 min
iteration: 14, rest of train: 243532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 46946.92it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2215 min
testing on 243532 words...


100%|██████████████████████████████████| 243532/243532 [40:42<00:00, 99.73it/s]


testing took 40.7369 min
iteration: 15, rest of train: 233532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 51438.67it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2176 min
testing on 233532 words...


100%|██████████████████████████████████| 233532/233532 [39:07<00:00, 99.49it/s]


testing took 39.1573 min
iteration: 16, rest of train: 223532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 45954.70it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.1962 min
testing on 223532 words...


100%|██████████████████████████████████| 223532/223532 [37:23<00:00, 99.62it/s]


testing took 37.4333 min
iteration: 17, rest of train: 213532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 51280.75it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2148 min
testing on 213532 words...


100%|██████████████████████████████████| 213532/213532 [35:53<00:00, 99.16it/s]


testing took 35.9239 min
iteration: 18, rest of train: 203532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 44324.98it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2598 min
testing on 203532 words...


100%|██████████████████████████████████| 203532/203532 [34:08<00:00, 99.34it/s]


testing took 34.1793 min
iteration: 19, rest of train: 193532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 54582.70it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.255 min
testing on 193532 words...


100%|██████████████████████████████████| 193532/193532 [32:22<00:00, 99.61it/s]


testing took 32.413 min
iteration: 20, rest of train: 183532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 47526.72it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.3054 min
testing on 183532 words...


100%|██████████████████████████████████| 183532/183532 [30:46<00:00, 99.41it/s]


testing took 30.7988 min
iteration: 21, rest of train: 173532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 46946.87it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.3171 min
testing on 173532 words...


100%|██████████████████████████████████| 173532/173532 [29:00<00:00, 99.73it/s]


testing took 29.0283 min
iteration: 22, rest of train: 163532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 47168.35it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2104 min
testing on 163532 words...


100%|██████████████████████████████████| 163532/163532 [27:20<00:00, 99.67it/s]


testing took 27.3712 min
iteration: 23, rest of train: 153532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 58274.70it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.2408 min
testing on 153532 words...


100%|█████████████████████████████████| 153532/153532 [25:35<00:00, 100.01it/s]


testing took 25.6113 min
iteration: 24, rest of train: 143532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 58291.14it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 8.3439 min
testing on 143532 words...


100%|██████████████████████████████████| 143532/143532 [28:36<00:00, 83.62it/s]


testing took 28.6375 min
iteration: 25, rest of train: 133532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 37340.22it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.3011 min
testing on 133532 words...


100%|██████████████████████████████████| 133532/133532 [29:00<00:00, 76.72it/s]


testing took 29.037 min
iteration: 26, rest of train: 123532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 37063.42it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.303 min
testing on 123532 words...


100%|██████████████████████████████████| 123532/123532 [26:48<00:00, 76.79it/s]


testing took 26.8398 min
iteration: 27, rest of train: 113532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 37256.94it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.353 min
testing on 113532 words...


100%|██████████████████████████████████| 113532/113532 [24:37<00:00, 76.85it/s]


testing took 24.648 min
iteration: 28, rest of train: 103532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 40616.23it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.3054 min
testing on 103532 words...


100%|██████████████████████████████████| 103532/103532 [22:25<00:00, 76.93it/s]


testing took 22.4534 min
iteration: 29, rest of train: 93532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 36309.89it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.3332 min
testing on 93532 words...


100%|████████████████████████████████████| 93532/93532 [20:21<00:00, 76.57it/s]


testing took 20.3798 min
iteration: 30, rest of train: 83532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 37964.20it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4763 min
testing on 83532 words...


100%|████████████████████████████████████| 83532/83532 [18:10<00:00, 76.62it/s]


testing took 18.1915 min
iteration: 31, rest of train: 73532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 37621.19it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4457 min
testing on 73532 words...


100%|████████████████████████████████████| 73532/73532 [16:02<00:00, 76.43it/s]


testing took 16.0536 min
iteration: 32, rest of train: 63532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 44522.31it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4789 min
testing on 63532 words...


100%|████████████████████████████████████| 63532/63532 [13:50<00:00, 76.46it/s]


testing took 13.865 min
iteration: 33, rest of train: 53532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 37480.16it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4396 min
testing on 53532 words...


100%|████████████████████████████████████| 53532/53532 [11:37<00:00, 76.75it/s]


testing took 11.6395 min
iteration: 34, rest of train: 43532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 38490.13it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4591 min
testing on 43532 words...


100%|████████████████████████████████████| 43532/43532 [09:27<00:00, 76.74it/s]


testing took 9.4679 min
iteration: 35, rest of train: 33532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 44522.35it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4378 min
testing on 33532 words...


100%|████████████████████████████████████| 33532/33532 [07:16<00:00, 76.89it/s]


testing took 7.2805 min
iteration: 36, rest of train: 23532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 38254.71it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4518 min
testing on 23532 words...


100%|████████████████████████████████████| 23532/23532 [05:06<00:00, 76.88it/s]


testing took 5.1118 min
iteration: 37, rest of train: 13532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 44013.06it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.4885 min
testing on 13532 words...


100%|████████████████████████████████████| 13532/13532 [02:55<00:00, 76.98it/s]


testing took 2.938 min
iteration: 38, rest of train: 3532


100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 44602.03it/s]


fitting on 10000 words...
creating new inference model...
saving inference models...
fitting took 10.5383 min
testing on 3532 words...


100%|██████████████████████████████████████| 3532/3532 [00:46<00:00, 76.31it/s]


testing took 0.7785 min


## testing Levenshtein

In [None]:
data = pd.read_csv('data/eval/eval_test_full_letters.csv', sep='\t', encoding = 'utf-8')

In [None]:
data = data[data.apply(lambda x: not x['classtag'].endswith(('-','+','0')), axis=1)]

In [None]:
data = data[['lemma','formtag','form','seq2seq_predictions']]

In [None]:

dists = []
for row in tqdm(range(data.shape[0])):
    prediction = data.iloc[row,3]
    true_form = data.iloc[row,2]
    dist = levenshtein(prediction,true_form)
    dists.append(dist)
data['dist'] = dists

In [None]:
data = data.sort_values(by=['dist'], ascending = False)

In [None]:
#data.loc[data['form'] == 'ржанцы']

In [None]:
data[:2000]