In [27]:
import pandas as pd
pd.options.display.max_colwidth = 1000

import matplotlib.pyplot as plt
%matplotlib inline

import sys
sys.path.append('..')

from word2morph import Word2Morph
from word2morph.data.loaders import DataLoader

In [2]:
word2morph = Word2Morph.load_model(path='../logs/best-model.joblib')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [3]:
train = DataLoader(file_path='../datasets/rus.train').load()
valid = DataLoader(file_path='../datasets/rus.valid').load()
test = DataLoader(file_path='../datasets/rus.test').load()

In [4]:
correct, wrong, predicted_samples = word2morph.evaluate(test, batch_size=1)


Evaluating for epoch 1...
{'test_acc': 0.9600512546889147,
 'test_acc_processed': 0.9600189173457508,
 'test_auc': 0.9743393176163997,
 'test_auc_processed': 0.9743105503871001,
 'test_f1': 0.9546705079727551,
 'test_f1_processed': 0.9546313196763908,
 'test_loss': 1.3797807815568301,
 'test_loss_processed': 1.380897673821573,
 'test_precision': 0.9597629994764508,
 'test_precision_processed': 0.9597406706562298,
 'test_recall': 0.9509018964213974,
 'test_recall_processed': 0.9508460589716579,
 'test_word_acc': 0.8689821755788772,
 'test_word_acc_processed': 0.8689821755788772}
Sample accuracy: 0.8689821755788772


In [7]:
len(correct), len(wrong), len(predicted_samples)

(20866, 3146, 24012)

## Wrong predictions

In [12]:
# pred, correct
str(wrong[0][0]), str(wrong[0][1])

('лядунка\tляд:ROOT/унк:SUFF/а:END', 'лядунка\tлядун:ROOT/к:SUFF/а:END')

In [30]:
df = pd.DataFrame(data=[(str(pred).split('\t')[0], str(pred).split('\t')[-1], 
                         str(cor).split('\t')[-1]) for pred, cor in wrong], 
                  columns=['Word', 'Predicted segments', 'Correct segments'])
df.head(n=50)

Unnamed: 0,Word,Predicted segments,Correct segments
0,лядунка,ляд:ROOT/унк:SUFF/а:END,лядун:ROOT/к:SUFF/а:END
1,жила,жи:ROOT/л:SUFF/а:END,жил:ROOT/а:END
2,выдыхательный,вы:PREF/дых:ROOT/а:SUFF/тель:SUFF/н:SUFF/ый:END,вы:PREF/дых:ROOT/а:SUFF/тельн:SUFF/ый:END
3,кокаиновый,кока:ROOT/ин:SUFF/ов:SUFF/ый:END,кок:ROOT/а:SUFF/ин:SUFF/ов:SUFF/ый:END
4,радиально-сверлильный,рад:ROOT/и:SUFF/альн:SUFF/о:LINK/-:HYPH/сверл:ROOT/и:SUFF/ль:SUFF/н:SUFF/ый:END,ради:ROOT/альн:SUFF/о:LINK/-:HYPH/сверл:ROOT/и:SUFF/ль:SUFF/н:SUFF/ый:END
5,рубчик,руб:ROOT/чик:SUFF,руб:ROOT/ч:SUFF/ик:SUFF
6,ороговеть,орог:ROOT/ов:SUFF/е:SUFF/ть:SUFF,о:PREF/рог:ROOT/ов:SUFF/е:SUFF/ть:SUFF
7,сотенка,сот:ROOT/енк:ROOT/а:END,сот:ROOT/ен:SUFF/к:SUFF/а:END
8,колоратура,колор:ROOT/ат:SUFF/ур:SUFF/а:END,колоратур:ROOT/а:END
9,подтвержденный,под:PREF/твержд:ROOT/енн:SUFF/ый:END,подтвержд:ROOT/енн:SUFF/ый:END


## Get accuracy by number of segments

In [21]:
max_nb_segments = max([len(c.segments) for p, c in correct] +
                      [len(c.segments) for p, c in wrong])
max_nb_segments

12

In [25]:
acc = [0] * 12
for length in range(1, max_nb_segments):
    correct_segments = [c.segments for p, c in correct if len(c.segments) == length]
    wrong_segments = [c.segments for p, c in wrong if len(c.segments) == length]
    
    acc[length] = len(correct_segments) / (len(wrong_segments) + len(correct_segments))

print(acc)

[0, 0.8358686949559647, 0.7747116968698518, 0.8276556776556776, 0.9014179608372721, 0.9277525022747952, 0.8538205980066446, 0.8378378378378378, 0.7647058823529411, 0.7272727272727273, 0.7567567567567568, 0.8]
