In [1]:
from keras.models import Model, model_from_json, load_model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import keras
from tqdm import tqdm
import pandas as pd
import pymorphy2
import matplotlib.pyplot as plt
import seaborn as sns



  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### data

In [8]:
# Input: the OpenCorpora .conllu file that is read into `data` below.
path_to_rfile = 'OpenCorpora_Texts/unamb_sent_14_6.conllu'
# Output: tab-separated table with the gold forms and both models' predictions.
path_to_wfile = 'data/eval/eval_OpenCorpora.csv'

In [9]:
# For OpenCorpora: read the .conllu file as a tab-separated table with ten
# positional columns; only 'form', 'lemma', 'POS' and 'gram' are used by the
# cells that follow.
# NOTE(review): read_csv's default quote handling is active here — confirm that
# tokens consisting of a double quote do not merge several rows into one.
data = pd.read_csv(path_to_rfile, encoding='utf-8',sep='\t',names = ['index','form','lemma','POS','1','gram','2','3','4','5'])

In [10]:
data.shape

(457583, 10)

In [11]:
# Keep only the rows tagged NOUN and the three columns used downstream.
# .copy() makes `data` an independent frame, so later column assignments do not
# write into a slice of the original table (the SettingWithCopyWarning case).
data = data.loc[data['POS'] == 'NOUN', ['lemma', 'gram', 'form']].copy()

In [12]:
data.shape

(121793, 3)

In [13]:
def converter(gram):
    """Convert a 'Key=Value|Key=Value' feature string into a two-character tag.

    The first character encodes Number: '1' when the value starts with 'S',
    '2' when it starts with 'P'. The second character is the first letter of
    the Case value (e.g. 'Case=Gen|Number=Plur' -> '2G').

    Args:
        gram: feature string with '|' between features and '=' inside each one.

    Returns:
        The two-character tag as a string.

    Raises:
        ValueError: a feature does not contain exactly one '=', or the Number
            value starts with neither 'S' nor 'P'.
        KeyError: the Number or Case feature is missing.
        IndexError: the Case value is empty.
    """
    features = {}
    for feature in gram.split('|'):
        key, value = feature.split('=')
        features[key] = value

    number = features['Number']
    if number.startswith('S'):
        tag = '1'
    elif number.startswith('P'):
        tag = '2'
    else:
        # Fail loudly instead of returning a one-character tag that callers
        # cannot split into (number, case).
        raise ValueError('unsupported Number value in {!r}'.format(gram))
    return tag + features['Case'][0]

In [14]:
def _gram_to_tag(gram):
    """Return converter(gram), or the sentinel 0 when the value cannot be converted."""
    try:
        return converter(gram)
    except (AttributeError, ValueError, KeyError, IndexError):
        # AttributeError: value is not a string (e.g. NaN);
        # ValueError / KeyError / IndexError: malformed or incomplete feature string.
        return 0


# Replace every feature string by its two-character tag in a single pass.
# Rows that cannot be converted get gram == 0, which the `data.gram != 0`
# filter in the next cell removes.
data['gram'] = data['gram'].map(_gram_to_tag)
len(data)

121793

In [15]:
# Drop the rows whose tag conversion failed (marked with 0). The copy keeps the
# following column assignments from writing into a slice of the previous frame.
data = data[data.gram != 0].copy()
len(data)

121282

In [16]:
# Lower-case the lemmas with the vectorised string accessor instead of a row-wise apply.
data['lemma'] = data['lemma'].str.lower()

In [17]:
# Lower-case the gold forms with the vectorised string accessor instead of a row-wise apply.
data['form'] = data['form'].str.lower()

In [18]:
# Model input: the lemma immediately followed by its two-character tag (e.g. 'блок2G').
# Column-wise concatenation replaces the row-wise apply.
data['x'] = data['lemma'] + data['gram']

In [85]:
#data = data[['x','form']]

### checking pymorphy

In [30]:
morph = pymorphy2.MorphAnalyzer()

In [44]:
word = morph.parse('зож')

In [45]:
word

[Parse(word='зож', tag=OpencorporaTag('UNKN'), normal_form='зож', score=1.0, methods_stack=((<UnknAnalyzer>, 'зож'),))]

In [35]:
# Translation tables from the seq2seq tag characters to pymorphy grammemes.
# Number: the first tag character ('1' or '2').
trans_number = {'1':'sing', '2':'plur'}
# Case: the second tag character, i.e. the first letter of the Case value that
# converter() emits. Only these six letters are covered; a lookup with any
# other letter raises KeyError.
trans_case = {'N':'nomn','G':'gent','D':'datv','A':'accs','I':'ablt','L':'loct'}

In [52]:
lemma = "твиттер"
number = 'sing'
case = 'loct'

In [53]:
# Take the first analysis that pymorphy tags as NOUN; fall back to the
# placeholder '-' when none of the analyses is a noun.
py_lemmas = morph.parse(lemma)
py_lemma = next((parse for parse in py_lemmas if parse.tag.POS == 'NOUN'), '-')

In [54]:
py_lemma

Parse(word='твиттер', tag=OpencorporaTag('NOUN,inan,masc sing,nomn'), normal_form='твиттер', score=0.23076923076923075, methods_stack=((<FakeDictionary>, 'твиттер', 33, 0), (<KnownSuffixAnalyzer>, 'иттер')))

In [55]:
py_method = str(py_lemma.methods_stack[0][0])
py_method

'<FakeDictionary>'

In [56]:
prediction = py_lemma.inflect({number, case}).word
prediction

'твиттере'

### checking seq2seq model

In [245]:
from model import predict_form, predict_paradigm

In [246]:
enc = load_model('models/encoder_total.h5')
dec = load_model('models/decoder_total.h5')



In [252]:
predict_paradigm('зашквар',enc,dec)

Unnamed: 0,Singular,Plural
Nom,зашквар,зашквары
Gen,зашквара,зашкваров
Dat,зашквару,зашкварам
Acc,зашквара,зашкваров
Ins,зашкваром,зашкварами
Loc,зашкваре,зашкварах


In [12]:
data.iloc[[35]]

Unnamed: 0,lemma,gram,form,x
143,блок,2G,блоков,блок2G


### looping through data and predicting forms with pymorphy

In [57]:
%%time
pymorphy_predictions = []
pymorphy_eval = []
pymorphy_exeptions = []
pymorphy_methods = []
for row in range(len(data)):
    lemma = data.iloc[row,0]
    #print(row)
    #print(data.iloc[row,1])
    number,case = str(data.iloc[row,1])
    number = trans_number[number]
    case = trans_case[case]
    py_lemmas = morph.parse(lemma)
    py_lemma = '-'
    for p_lemma in py_lemmas:
        if p_lemma.tag.POS == 'NOUN':
            py_lemma = p_lemma
            break 
    try:
        py_method = str(py_lemma.methods_stack[0][0])
        prediction = py_lemma.inflect({number, case}).word        
    except Exception as e:
        pymorphy_exeptions.append((number,case,lemma))
        prediction =  '-'
        py_method = '-'      
        #print('{} {}({})'.format(number,case,lemma))  
    pymorphy_methods.append(py_method)
    pymorphy_predictions.append(prediction)
    true_form = data.iloc[row,2]
    if prediction == true_form:
        pymorphy_eval.append('True')
    else:
        pymorphy_eval.append('False')

Wall time: 43.7 s


In [59]:
# Attach the pymorphy results as new columns; each list holds one entry per row of `data`.
data['pymorphy_predictions'] = pymorphy_predictions
data['pymorphy_eval'] = pymorphy_eval
data['pymorphy_methods'] = pymorphy_methods

### predicting forms with seq2seq

In [20]:
%%time
seq2seq_predictions = []
seq2seq_eval = []
for row in tqdm(range(len(data))):
#for row in tqdm(range(100)):
    inp = data.iloc[row,3]
    out = predict_form(inp,enc,dec).rstrip()
    seq2seq_predictions.append(out)
    if out == data.iloc[row,2]:
        seq2seq_eval.append('True')
    else:
        seq2seq_eval.append('False')

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:01<00:00, 52.23it/s]


Wall time: 1.92 s


In [23]:
# Attach the seq2seq results as new columns; both lists must have one entry per row of `data`.
# NOTE(review): the recorded output of this cell is a SettingWithCopyWarning —
# `data` appears to be a filtered slice at this point; confirm it is an independent copy.
data['seq2seq_predictions'] = seq2seq_predictions
data['seq2seq_eval'] = seq2seq_eval


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
# Load seq2seq predictions saved earlier (one prediction per line) instead of
# re-running the model.
with open('testing/model_predict_full_letters.txt', 'r', encoding='utf-8') as predictions_file:
    seq2seq_predictions = predictions_file.read().split('\n')
# A file that ends with a newline yields one extra empty element; drop it so the
# list lines up with the rows of `data`.
if len(seq2seq_predictions) == len(data) + 1 and seq2seq_predictions[-1] == '':
    seq2seq_predictions.pop()
data['seq2seq_predictions'] = seq2seq_predictions

# Compare by column name: positional indices silently point at the wrong column
# as soon as the column layout of `data` changes.
seq2seq_eval = [
    'True' if prediction == true_form else 'False'
    for true_form, prediction in zip(data['form'], data['seq2seq_predictions'])
]
data['seq2seq_eval'] = seq2seq_eval

In [64]:
# Save the evaluation table as a tab-separated file, without the index column.
data.to_csv(path_to_wfile, encoding='utf-8',sep='\t', index=False)
#writer = pd.ExcelWriter(path_to_wfile.replace('.csv', '.xlsx'))
#data.to_excel(writer,'Sheet1')
#writer.save()
# for measuring accuracy without counting "errors" on indeclinable, pl. tantum and sg. tantum nouns
#data_filtered = data[data.apply(lambda x: not x['classtag'].endswith(('-','+','0')), axis=1)] 
#data_filtered.to_csv(path_to_wfile_filtered, encoding='utf-8',sep='\t', index=False)

In [19]:
# Reload the saved evaluation table; the analysis cells below work from this file.
# NOTE(review): the cells below compare the *_eval columns with booleans
# (== True / == False), while the loops above stored the strings 'True'/'False' —
# presumably read_csv parses them as bool on this round trip; confirm.
data = pd.read_csv(path_to_wfile, encoding='utf-8',sep='\t')

In [20]:
# Normalise 'ё' to 'е' in the pymorphy predictions. The .str accessor leaves
# missing values (possible after the CSV reload) untouched instead of raising.
# NOTE(review): pymorphy_eval is not recomputed after this in the visible cells.
data['pymorphy_predictions'] = data['pymorphy_predictions'].str.replace("ё", "е")

In [21]:
# Normalise 'ё' to 'е' in the seq2seq predictions. The .str accessor leaves
# missing values (possible after the CSV reload) untouched instead of raising.
# NOTE(review): seq2seq_eval is not recomputed after this in the visible cells.
data['seq2seq_predictions'] = data['seq2seq_predictions'].str.replace("ё", "е")

In [22]:
# Words that pymorphy did not analyse with its dictionary analyser
is_out_of_dictionary = data['pymorphy_methods'] != '<DictionaryAnalyzer>'
data_filtered = data[is_out_of_dictionary]

In [23]:
len(data)

121282

In [24]:
len(data_filtered)

180

In [25]:
data_filtered

Unnamed: 0,lemma,gram,form,x,pymorphy_predictions,pymorphy_eval,seq2seq_predictions,seq2seq_eval,pymorphy_methods
921,нары,2N,нары,нары2N,-,False,нары,True,-
3767,зож,1N,зож,зож1N,-,False,зож,True,-
3771,зож,1D,зож,зож1D,-,False,зожу,False,-
3773,зож,1G,зож,зож1G,-,False,зожа,False,-
5136,мкм,1G,мкм,мкм1G,-,False,мкма,False,-
5234,мкм,1G,мкм,мкм1G,-,False,мкма,False,-
5645,пара,2G,пар,пара2G,-,False,пар,True,-
5959,фастфуд,1G,фастфуда,фастфуд1G,фастфуда,True,фастфуда,True,<FakeDictionary>
6389,гк,1G,гк,гк1G,-,False,гна,False,-
7154,бутса,1G,бутсы,бутса1G,бутса,False,бутсы,True,<FakeDictionary>


In [28]:
data_filtered[(data_filtered['pymorphy_eval'] == False) & (data_filtered['seq2seq_eval'] == True)]

Unnamed: 0,lemma,gram,form,x,pymorphy_predictions,pymorphy_eval,seq2seq_predictions,seq2seq_eval,pymorphy_methods
921,нары,2N,нары,нары2N,-,False,нары,True,-
3767,зож,1N,зож,зож1N,-,False,зож,True,-
5645,пара,2G,пар,пара2G,-,False,пар,True,-
7154,бутса,1G,бутсы,бутса1G,бутса,False,бутсы,True,<FakeDictionary>
7549,спо,1G,спо,спо1G,-,False,спо,True,-
7563,спо,1N,спо,спо1N,-,False,спо,True,-
9665,шт,2G,шт,шт2G,-,False,шт,True,-
11588,уста,2L,устах,уста2L,-,False,устах,True,-
20971,шт,1N,шт,шт1N,-,False,шт,True,-
20972,шт,1N,шт,шт1N,-,False,шт,True,-


In [75]:
len(data_filtered[(data_filtered['seq2seq_eval'] == False)])

76

### accuracy

In [26]:
def accuracy(model,data):
    """Print the True/False counts and the accuracy of one model's eval column.

    Args:
        model: column prefix; the column read is ``model + '_eval'``.
        data: DataFrame holding that column. Values may be booleans or the
            strings 'True' / 'False'; both spellings are counted.

    Returns:
        None. The value counts, the number of rows, both counts and the
        accuracy (rounded to 4 decimals) are printed.

    Raises:
        KeyError: the eval column is not present in ``data``.
    """
    column = model + '_eval'
    eval_ = data[column]
    value_counts =  eval_.value_counts()
    print(value_counts)
    total = len(eval_)
    print(total)
    # Count via the string form so that a missing label gives 0 instead of a
    # KeyError, and so bool and 'True'/'False' string columns behave the same.
    labels = eval_.astype(str)
    n_true = int((labels == 'True').sum())
    n_false = int((labels == 'False').sum())
    # Accuracy is undefined for an empty column; report NaN rather than divide by zero.
    acc = round(n_true / total, 4) if total else float('nan')
    print('for model {}:'.format(model))
    print('False:', n_false)
    print('True:', n_true)
    print('accuracy:',acc)

In [27]:
accuracy('seq2seq', data)

True     114376
False      6906
Name: seq2seq_eval, dtype: int64
121282
for model seq2seq:
False: 6906
True: 114376
accuracy: 0.9431


In [70]:
accuracy('pymorphy', data)

True     117414
False      3868
Name: pymorphy_eval, dtype: int64
121282
for model pymorphy:
False: 3868
True: 117414
accuracy: 0.9681


In [71]:
accuracy('seq2seq', data_filtered)

True     104
False     76
Name: seq2seq_eval, dtype: int64
180
for model seq2seq:
False: 76
True: 104
accuracy: 0.5778


In [72]:
accuracy('pymorphy', data_filtered)

False    135
True      45
Name: pymorphy_eval, dtype: int64
180
for model pymorphy:
False: 135
True: 45
accuracy: 0.25
