In [1]:
from keras.models import Model, model_from_json, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import keras
from tqdm import tqdm
import pandas as pd
import pymorphy2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### data

In [2]:
path_to_rfile = 'data/GIKRYA_texts_new/gikrya_new_train.out'
path_to_wfile = 'data/eval/eval_gikrya_train.csv'

In [6]:
# для ГИКРЯ
data = pd.read_csv(path_to_rfile, encoding='utf-8',sep='\t',names = ['index','form','lemma','POS','gram'])

In [7]:
data.shape

(815884, 5)

In [4]:
data = data.loc[data['POS']=='NOUN']
data = data[['lemma','gram','form']]

In [5]:
def converter(gram):
    grams = gram.split('|')
    dc_gram = {}
    for gram in grams:
        k,v = gram.split('=')
        dc_gram[k] = v
    tag = ''
    if dc_gram['Number'].startswith('S'):
        tag += '1'
    elif dc_gram['Number'].startswith('P'):
        tag += '2'
    else: print(gram)
    tag += dc_gram['Case'][0]
    return tag

In [6]:
for i in range(0,len(data)):
    try:
        data.iloc[i,1] = converter(data.iloc[i,1])
    except Exception as e:
        #print(e)
        #print(data.iloc[[i]])
        data.iloc[[i]] = 0
len(data)   

182633

In [7]:
data = data[data.gram != 0]
len(data)

182633

In [8]:
data['lemma'] = data.apply(lambda row: row['lemma'].lower(), axis=1)

In [9]:
data['form'] = data.apply(lambda row: row['form'].lower(), axis=1)

In [10]:
data['x'] = data.apply(lambda row: row['lemma'] + row['gram'], axis=1)

In [85]:
#data = data[['x','form']]

### checking pymorphy

In [4]:
morph = pymorphy2.MorphAnalyzer()

In [63]:
word = morph.parse('бита')

In [64]:
word

[Parse(word='бита', tag=OpencorporaTag('ADJS,Qual femn,sing'), normal_form='битый', score=0.25, methods_stack=((<DictionaryAnalyzer>, 'бита', 4, 28),)),
 Parse(word='бита', tag=OpencorporaTag('NOUN,inan,masc sing,gent'), normal_form='бит', score=0.25, methods_stack=((<DictionaryAnalyzer>, 'бита', 236, 1),)),
 Parse(word='бита', tag=OpencorporaTag('NOUN,inan,femn sing,nomn'), normal_form='бита', score=0.25, methods_stack=((<DictionaryAnalyzer>, 'бита', 441, 0),)),
 Parse(word='бита', tag=OpencorporaTag('PRTS,impf,past,pssv femn,sing'), normal_form='бить', score=0.25, methods_stack=((<DictionaryAnalyzer>, 'бита', 444, 97),))]

In [282]:
#table from seq2seq to pymorphy
trans_number = {'1':'sing', '2':'plur'}
trans_case = {'N':'nomn','G':'gent','D':'datv','A':'accs','I':'ablt','L':'loct'}

In [284]:
lemma = "миноритарий"
number = 'sing'
case = 'ablt'

In [285]:
py_lemmas = morph.parse(lemma)
py_lemma = '-'
for p_lemma in py_lemmas:
    if p_lemma.tag.POS == 'NOUN':
        py_lemma = p_lemma
        break 

In [286]:
py_lemma

Parse(word='миноритарий', tag=OpencorporaTag('NOUN,inan,masc sing,nomn'), normal_form='миноритарий', score=0.5, methods_stack=((<FakeDictionary>, 'миноритарий', 81, 0), (<KnownSuffixAnalyzer>, 'тарий')))

In [287]:
py_method = str(py_lemma.methods_stack[0][0])
py_method

'<FakeDictionary>'

In [288]:
prediction = py_lemma.inflect({number, case}).word
prediction

'миноритарием'

py_lemma.inflect({number, case})

In [186]:
number

'sing'

### checking seq2seq model

In [245]:
from model import predict_form, predict_paradigm

In [246]:
enc = load_model('models/encoder_total.h5')
dec = load_model('models/decoder_total.h5')



In [252]:
predict_paradigm('зашквар',enc,dec)

Unnamed: 0,Singular,Plural
Nom,зашквар,зашквары
Gen,зашквара,зашкваров
Dat,зашквару,зашкварам
Acc,зашквара,зашкваров
Ins,зашкваром,зашкварами
Loc,зашкваре,зашкварах


In [12]:
data.iloc[[35]]

Unnamed: 0,lemma,gram,form,x
143,блок,2G,блоков,блок2G


### looping through data and predicting forms with pymorphy

In [190]:
%%time
pymorphy_predictions = []
pymorphy_eval = []
pymorphy_exeptions = []
pymorphy_methods = []
for row in range(len(data)):
    lemma = data.iloc[row,0]
    #print(row)
    #print(data.iloc[row,1])
    number,case = str(data.iloc[row,1])
    number = trans_number[number]
    case = trans_case[case]
    py_lemmas = morph.parse(lemma)
    py_lemma = '-'
    for p_lemma in py_lemmas:
        if p_lemma.tag.POS == 'NOUN':
            py_lemma = p_lemma
            break 
    try:
        py_method = str(py_lemma.methods_stack[0][0])
        prediction = py_lemma.inflect({number, case}).word        
    except Exception as e:
        pymorphy_exeptions.append((number,case,lemma))
        prediction =  '-'
        py_method = '-'      
        #print('{} {}({})'.format(number,case,lemma))  
    pymorphy_methods.append(py_method)
    pymorphy_predictions.append(prediction)
    true_form = data.iloc[row,2]
    if prediction == true_form:
        pymorphy_eval.append('True')
    else:
        pymorphy_eval.append('False')

Wall time: 1min 5s


In [191]:
data['pymorphy_predictions'] = pymorphy_predictions
data['pymorphy_eval'] = pymorphy_eval
data['pymorphy_methods'] = pymorphy_methods

### predicting forms with seq2seq

In [20]:
%%time
seq2seq_predictions = []
seq2seq_eval = []
for row in tqdm(range(len(data))):
#for row in tqdm(range(100)):
    inp = data.iloc[row,3]
    out = predict_form(inp,enc,dec).rstrip()
    seq2seq_predictions.append(out)
    if out == data.iloc[row,2]:
        seq2seq_eval.append('True')
    else:
        seq2seq_eval.append('False')

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 52.23it/s]


Wall time: 1.92 s


In [23]:
data['seq2seq_predictions'] = seq2seq_predictions
data['seq2seq_eval'] = seq2seq_eval


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
#uploading results from file
seq2seq_predictions = open('testing/model_predict_full_letters.txt','r',encoding = 'utf-8').read().split('\n')
data['seq2seq_predictions'] = seq2seq_predictions
seq2seq_eval = []
for row in range(data.shape[0]):
    true_form = data.iloc[row,4]
    prediction = data.iloc[row,7]
    if prediction == true_form:
        seq2seq_eval.append('True')
    else:
        seq2seq_eval.append('False')
data['seq2seq_eval'] = seq2seq_eval      

In [217]:
data.to_csv(path_to_wfile, encoding='utf-8',sep='\t', index=False)
#writer = pd.ExcelWriter(path_to_wfile.replace('.csv', '.xlsx'))
#data.to_excel(writer,'Sheet1')
#writer.save()
#для замеров аккуратности без учета "ошибок" на неизменяемых, pl.tantum, sg.tantum
#data_filtered = data[data.apply(lambda x: not x['classtag'].endswith(('-','+','0')), axis=1)] 
#data_filtered.to_csv(path_to_wfile_filtered, encoding='utf-8',sep='\t', index=False)

In [3]:
data = pd.read_csv(path_to_wfile, encoding='utf-8',sep='\t')

In [4]:
data.shape

(182633, 9)

In [229]:
data['pymorphy_predictions'] = data['pymorphy_predictions'].apply(lambda x: x.replace("ё","е"))

In [231]:
data['seq2seq_predictions'] = data['seq2seq_predictions'].apply(lambda x: x.replace("ё","е"))

In [5]:
# не словарные слова для pymorphy
data_filtered = data[data.apply(lambda x: x['pymorphy_methods'] != '<DictionaryAnalyzer>', axis=1)]

In [6]:
len(data_filtered)

2045

In [9]:
data_filtered[(data_filtered['pymorphy_eval'] == True) & (data_filtered['seq2seq_eval'] == False)]

Unnamed: 0,lemma,gram,form,x,pymorphy_predictions,pymorphy_eval,seq2seq_predictions,seq2seq_eval,pymorphy_methods
116,65-летие,1D,65-летию,65-летие1D,65-летию,True,-,False,<HyphenatedWordsAnalyzer>
1314,шварценеггер,1D,шварценеггеру,шварценеггер1D,шварценеггеру,True,шварценгерегу,False,<FakeDictionary>
2890,мень,1G,меня,мень1G,меня,True,мени,False,<FakeDictionary>
3451,распадская,1N,распадская,распадская1N,распадская,True,распадсаби,False,<FakeDictionary>
4203,бандюк,2A,бандюков,бандюк2A,бандюков,True,бандюки,False,<FakeDictionary>
5860,плей-офф,1L,плей-офф,плей-офф1L,плей-офф,True,плей-оффе,False,<FakeDictionary>
7105,распадская,1N,распадская,распадская1N,распадская,True,распадсаби,False,<FakeDictionary>
8201,усть-каменогорск,1A,усть-каменогорск,усть-каменогорск1A,усть-каменогорск,True,усть-каменокормк,False,<HyphenatedWordsAnalyzer>
14921,фото-рассказ,1N,фото-рассказ,фото-рассказ1N,фото-рассказ,True,фото-рассика,False,<HyphenatedWordsAnalyzer>
15347,пипец,1A,пипец,пипец1A,пипец,True,пипца,False,<FakeDictionary>


In [235]:
len(data_filtered[(data_filtered['seq2seq_eval'] == True)])

1020

### accuracy

In [236]:
def accuracy(model,data):
    column = model + '_eval'
    eval_ = data[column]
    value_counts =  eval_.value_counts()
    print(value_counts)
    print((len(eval_)))
    acc = value_counts[True]/(len(eval_))
    acc = round(acc,4)
    print('for model {}:'.format(model))
    print('False:', value_counts[False])
    print('True:', value_counts[True])
    print('accuracy:',acc)

In [237]:
accuracy('seq2seq', data)

True     167119
False     15514
Name: seq2seq_eval, dtype: int64
182633
for model seq2seq:
False: 15514
True: 167119
accuracy: 0.9151


In [238]:
accuracy('pymorphy', data)

True     171697
False     10936
Name: pymorphy_eval, dtype: int64
182633
for model pymorphy:
False: 10936
True: 171697
accuracy: 0.9401


In [239]:
accuracy('seq2seq', data_filtered)

False    1025
True     1020
Name: seq2seq_eval, dtype: int64
2045
for model seq2seq:
False: 1025
True: 1020
accuracy: 0.4988


In [240]:
accuracy('pymorphy', data_filtered)

False    1437
True      608
Name: pymorphy_eval, dtype: int64
2045
for model pymorphy:
False: 1437
True: 608
accuracy: 0.2973
