<a href="https://colab.research.google.com/github/lnpetrova/comp_ling/blob/master/ML_hw8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Импорты**

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Activation, LSTM, Bidirectional, TimeDistributed, InputLayer, Embedding, Input, Dropout, Flatten, Conv1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model

import numpy as np
from itertools import chain
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('treebank')
from string import punctuation
punct = punctuation+'«»—…“”*№–'

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [3]:
from numpy.random import seed
seed(0)
from tensorflow.random import set_seed
set_seed(0)

In [5]:
tagged_sentences = nltk.corpus.treebank.tagged_sents()

In [6]:
tagged_sentences[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [7]:
sentences, sentence_tags =[], [] 
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(sentence)
    sentence_tags.append(tags)

In [8]:
sentence_tags[0]

('NNP',
 'NNP',
 ',',
 'CD',
 'NNS',
 'JJ',
 ',',
 'MD',
 'VB',
 'DT',
 'NN',
 'IN',
 'DT',
 'JJ',
 'NN',
 'NNP',
 'CD',
 '.')

In [9]:
sent_train, sent_test, tag_train, tag_test = train_test_split(sentences, sentence_tags, test_size=0.2, random_state=0)

In [10]:
vocab = Counter()
for sent in sent_train:
    sent = [word.lower() for word in sent]
    vocab.update(sent)

In [11]:
filtered_vocab = {word for word in vocab if vocab[word] > 5}

In [12]:
len(filtered_vocab)

1679

In [13]:
word2id = {'PAD':0,'UNK':1}    
for i,word in enumerate(filtered_vocab):
      word2id[word] = i + 2

id2word = {i:word for word, i in word2id.items()}

In [14]:
tag2id = {'PAD':0}  
for tags in tag_train:
    for tag in tags:
      if tag.lower() not in tag2id:
        tag2id[tag.lower()] = len(tag2id)

id2tag = {i:tag for tag, i in tag2id.items()}

In [15]:
len(id2tag)

47

In [16]:
def data2ints(data, smth2id):
  int_data = []
  for seq in data:
      int_seq = []
      for i in seq:
          try:
            int_seq.append(smth2id[i.lower()])
          except KeyError:
            int_seq.append(smth2id['UNK'])
  
      int_data.append(int_seq)
  return int_data

In [17]:
X_train_ids, X_test_ids = data2ints(sent_train, word2id), data2ints(sent_test, word2id)
y_train_ids, y_test_ids = data2ints(tag_train, tag2id), data2ints(tag_test, tag2id)


print(X_train_ids[0])
print(X_test_ids[0])
print(y_train_ids[0])
print(y_test_ids[0])

[111, 1, 1, 1, 1288, 1, 1115, 1225, 1286]
[1680, 631, 1485, 320, 796, 1105, 790, 1, 1616, 1, 1067, 169, 1352, 1286]
[1, 1, 1, 2, 1, 1, 3, 4, 5]
[18, 19, 21, 24, 10, 25, 24, 18, 21, 14, 3, 7, 15, 5]


In [18]:
MAX_LEN = max(len(x) for x in X_train_ids)

In [19]:
X_train, X_test = pad_sequences(X_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(X_test_ids, maxlen=MAX_LEN, padding='post')
y_train_pad, y_test_pad = pad_sequences(y_train_ids, maxlen=MAX_LEN, padding='post'), pad_sequences(y_test_ids, maxlen=MAX_LEN, padding='post')

In [20]:
print(X_train.shape, y_train_pad.shape, X_test.shape, y_test_pad.shape)

(3131, 128) (3131, 128) (783, 128) (783, 128)


In [21]:
y_train, y_test = to_categorical(y_train_pad, num_classes=len(tag2id)), to_categorical(y_test_pad, num_classes=len(tag2id))

In [22]:
y_test.shape

(783, 128, 47)

In [23]:
chars = Counter()
for sent in sent_train:
  for word in sent:
    word = [char.lower() for char in word]
    chars.update(word)

In [24]:
char2id = {'PAD':0,'UNK':1}    
for i,char in enumerate(chars):
      char2id[char] = i + 2

id2char = {i:char for char, i in char2id.items()}

In [25]:
MAX_CHAR_LEN = max(max(len(word) for word in sent) for sent in sent_train)
MAX_CHAR_LEN 

24

In [26]:
X_train_ids_char = [data2ints(sent, char2id) for sent in sent_train]
X_test_ids_char = [data2ints(sent, char2id) for sent in sent_test]

In [27]:
X_train_char = pad_sequences([pad_sequences(ids, maxlen=MAX_CHAR_LEN, padding='post') for ids in X_train_ids_char], maxlen=MAX_LEN, padding='post')
X_test_char = pad_sequences([pad_sequences(ids, maxlen=MAX_CHAR_LEN, padding='post') for ids in X_test_ids_char], maxlen=MAX_LEN, padding='post')

In [28]:
print(X_train_char.shape, X_test_char.shape)

(3131, 128, 24) (783, 128, 24)


In [29]:
from tensorflow.keras.layers import Concatenate, Dense

In [30]:
input1 = Input(shape=(MAX_LEN,))
embeddings1 = Embedding(input_dim=len(word2id), output_dim=100, mask_zero=True)(input1)
bilstm1 = Bidirectional(LSTM(128, return_sequences=True))(embeddings1)
drop1 = Dropout(0.2)(bilstm1)

input2 = Input(shape=(MAX_LEN, MAX_CHAR_LEN,))
embeddings2 = TimeDistributed(Embedding(len(char2id),output_dim=40))(input2)
conv = TimeDistributed(tf.keras.layers.Conv1D(25, 3, 1))(embeddings2)
flat = TimeDistributed(Flatten(), name='flat')(conv)
drop2 = Dropout(0.2)(flat)
drops = Concatenate()([drop1, drop2])

bi = Bidirectional(LSTM(128, return_sequences=True))(drops)
outputs = TimeDistributed(Dense(len(tag2id), activation='softmax'))(bi)

model = tf.keras.Model(inputs=[input1, input2], outputs=outputs)
model.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 128, 24)]    0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 128, 24, 40)  2160        input_2[0][0]                    
__________________________________________________________________________________________________
embedding (Embedding)           (None, 128, 100)     168100      input_1[0][0]                    
______________________________________________________________________________________________

In [33]:
model.fit([X_train, X_train_char], y_train, 
          validation_data=([X_test, X_test_char], y_test),
          batch_size=64,
          epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f5b8bc90cf8>

# **Функция для проверки**

In [66]:
def checking(text, model=model):
    chars = [[char2id.get(char.lower(), 1) for char in word] for word in text]
    sent = [word2id.get(word.lower(), 1) for word in text]
    pad_chars = pad_sequences([pad_sequences(ids, maxlen=MAX_CHAR_LEN, padding='post') for ids in [chars]], maxlen=MAX_LEN, padding='post')   
    pad_sent = pad_sequences([sent], maxlen=MAX_LEN, padding='post')
    predicted_tags = model.predict([pad_sent, pad_chars])
    predicted_tags = [id2tag[tag] for tag in np.argmax(predicted_tags, axis=2)[0, :len(text)]]
    tagged_sent= list(zip(text, predicted_tags))
    return tagged_sent

In [67]:
checking(sent_test[0], model)

[('You', 'prp'),
 ('do', 'vbp'),
 ("n't", 'rb'),
 ('want', 'vb'),
 ('*-1', '-none-'),
 ('to', 'to'),
 ('get', 'vb'),
 ('yourself', 'nn'),
 ('too', 'rb'),
 ('upset', 'vbn'),
 ('about', 'in'),
 ('these', 'dt'),
 ('things', 'nns'),
 ('.', '.')]

In [68]:
checking(sent_test[5], model)

[('Mr.', 'nnp'),
 ('Reupke', 'nnp'),
 ('was', 'vbd'),
 ('one', 'cd'),
 ('of', 'in'),
 ('three', 'cd'),
 ('executives', 'nns'),
 ('on', 'in'),
 ('Reuters', 'nnp'),
 ("'s", 'pos'),
 ('eight-person', 'jj'),
 ('executive', 'jj'),
 ('committee', 'nn'),
 ('who', 'wp'),
 ('*T*-1', '-none-'),
 ('did', 'vbd'),
 ("n't", 'rb'),
 ('also', 'rb'),
 ('serve', 'vb'),
 ('on', 'in'),
 ('the', 'dt'),
 ('company', 'nn'),
 ("'s", 'pos'),
 ('board', 'nn'),
 ('of', 'in'),
 ('directors', 'nns'),
 ('.', '.')]