In [0]:
!git clone https://github.com/IlyaSk10/POS_Tagger_with_using_CNN/
%cd POS_Tagger_with_using_CNN
!unzip SYNTAGRUS_texts.zip
!head syntagrus_full.ud
from data_preparation.converter_from_ud_to_txt import UDConverter
UDConverter.convert_from_conllu("syntagrus_full.ud", "syntagrus_fixed.txt")
!head syntagrus_fixed.txt
text_data= "syntagrus_fixed.txt"

Cloning into 'POS_Tagger_with_using_CNN'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 31 (delta 10), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (31/31), done.
/content/POS_Tagger_with_using_CNN
Archive:  SYNTAGRUS_texts.zip
  inflating: syntagrus_full.ud       
1	Начальник	начальник	NOUN	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
2	областного	областной	ADJ	Case=Gen|Degree=Pos|Gender=Neut|Number=Sing
3	управления	управление	NOUN	Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing
4	связи	связь	NOUN	Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing
5	Семен	семен	NOUN	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
6	Еремеевич	еремеевич	NOUN	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
7	был	быть	VERB	Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
8	человек	человек	NOUN	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing
9	простой	простой	ADJ	Case=Nom|D

In [0]:
def load_from_file(file):
  with open(file,'r',encoding='utf-8') as f:
    c=f.read().split('\n')
  return c

In [0]:
dataset=load_from_file(text_data)

In [0]:
dataset=[line for line in dataset if line!='']
MAX_LEN_TOKEN=max(len(line.split('\t')[0]) for line in dataset)
NUMBER_UNIQUE_TOKEN=len(set(line.split('\t')[0] for line in dataset if line.split('\t')[2]!='PUNCT'))
print('Максимальная длина токена', MAX_LEN_TOKEN)
print('Количество уникальных токенов' , NUMBER_UNIQUE_TOKEN)

Максимальная длина токена 31
Количество уникальных токенов 112875


In [0]:
labels = ['<NOTAG>'] + sorted({token.split('\t')[2] for token in dataset})
label2id = {label: i+1 for i, label in enumerate(labels)}
print('Метки частей речи' , label2id)

Метки частей речи {'<NOTAG>': 1, 'ADJ': 2, 'ADP': 3, 'ADV': 4, 'CONJ': 5, 'DET': 6, 'H': 7, 'INTJ': 8, 'NOUN': 9, 'NUM': 10, 'PART': 11, 'PRON': 12, 'PUNCT': 13, 'VERB': 14}


In [0]:
from collections import Counter
def build_vocab(text,pad_symbol='<PAD>'):
  sentence=' '.join(line.split('\t')[0] for line in text)
  mydict = dict((j, i+1) for i, j in enumerate(set(sentence)))
  mydict.update({pad_symbol:max(mydict.values())+1})
  return mydict

def counter(dataset,most_freq_symbols):
  if most_freq_symbols is None:
    most_freq_symbols=5
  sentence=' '.join([word.split('\t')[0] for word in dataset])
  return Counter(list(sentence)).most_common(most_freq_symbols)    


In [20]:
print('Символы словаря',build_vocab(text=dataset[:10]),'\n','Наиболее частотные символы',counter(dataset[:10],most_freq_symbols=10))

Символы словаря {'ы': 1, 'к': 2, 'л': 3, 'о': 4, 'а': 5, 'т': 6, 'й': 7, ',': 8, 'ь': 9, 'в': 10, 'ч': 11, 'я': 12, 'б': 13, 'е': 14, 'Н': 15, 'з': 16, 'п': 17, 'н': 18, 'С': 19, 'Е': 20, 'у': 21, 'г': 22, ' ': 23, 'р': 24, 'с': 25, 'м': 26, 'и': 27, '<PAD>': 28} 
 Наиболее частотные символы [(' ', 9), ('е', 8), ('о', 6), ('л', 5), ('а', 4), ('н', 4), ('и', 4), ('в', 4), ('ч', 3), ('с', 3)]


In [0]:
vocab=build_vocab(text=dataset[:10])

word2id=[]
label_to_id=[]
for index in range(len(dataset)):
  word2id.append([number for j in dataset[index].split('\t')[0] for symbol,number in vocab.items() if symbol==j.lower()])
  label_to_id.append([number for symbol,number in label2id.items() if symbol==dataset[index].split('\t')[2]])

In [0]:
import numpy as np

def pad(word2id,pad_symbol,max_len_token):
  mat_with_pads=np.zeros(shape=(len(word2id),MAX_LEN_TOKEN),dtype=int)
  for index,ident in enumerate(word2id):
    np.put(mat_with_pads[index],range(len(ident)),ident)
  return mat_with_pads

In [0]:
words=pad(word2id,pad_symbol=0,max_len_token=MAX_LEN_TOKEN)

In [34]:
print('Преобразование слов в набор симвлолов для {} слов'.format(10),'\n',words[:10])
print('Преобразование частей речи в метки для {} меток'.format(10),'\n',label_to_id[:10])

Преобразование слов в набор симвлолов для 10 слов 
 [[18  5 11  5  3  9 18 27  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [ 4 13  3  5 25  6 18  4 22  4  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [21 17 24  5 10  3 14 18 27 12  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [25 10 12 16 27  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [25 14 26 14 18  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [14 24 14 26 14 14 10 27 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [13  1  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [11 14  3  4 10 14  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [17 24  4 25  6  4  7  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0]
 [ 8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(words, label_to_id, test_size=0.33, random_state=42)

from keras.utils import to_categorical

y_train_cat=to_categorical(y_train, num_classes=len(labels)+1)
y_test_cat=to_categorical(y_test, num_classes=len(labels)+1)

Using TensorFlow backend.


In [0]:
from keras.models import Sequential
from keras.layers import Dense, Activation,Embedding,Bidirectional,LSTM

# создание сети
model = Sequential()
vocab_size=NUMBER_UNIQUE_TOKEN
num_classes=len(labels)+1
model.add(Embedding(vocab_size, output_dim= MAX_LEN_TOKEN, input_length=MAX_LEN_TOKEN))
model.add(Bidirectional(LSTM(units=32, input_shape=(NUMBER_UNIQUE_TOKEN, MAX_LEN_TOKEN),dropout = 0.2)))
model.add(Dense(units=num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# обучение модели
model.fit(X_train, y_train_cat, batch_size = 64,epochs=1,validation_data = (X_test, y_test_cat),verbose = 1)