In [None]:
import nltk
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import conll2000

# Fetch every NLTK resource this notebook relies on.
for resource in ("treebank", "brown", "conll2000", "universal_tagset", "tagsets"):
    nltk.download(resource)

# Read each corpus as sentences of (word, tag) pairs, requesting the
# "universal" tagset so all three corpora share one tag inventory.
treebank_corpus, brown_corpus, conll_corpus = (
    reader.tagged_sents(tagset="universal")
    for reader in (treebank, brown, conll2000)
)

# Chain the three corpora into a single collection of tagged sentences.
tagged_sentences = treebank_corpus + brown_corpus + conll_corpus

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [None]:
# Report how many sentences the combined corpus holds, then display the first one.
corpus_size = len(tagged_sentences)
print("data size:", corpus_size)
tagged_sentences[0]

data size: 72202


[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [None]:
# Separate every tagged sentence into two parallel lists: its words and its tags.
words, tags = [], []

for sentence in tagged_sentences:
    # Each element of a sentence is a (word, tag) pair.
    words.append([pair[0] for pair in sentence])
    tags.append([pair[1] for pair in sentence])

In [None]:
# Show the first sentence's words next to its tags as a sanity check.
for label, sample in (("sample words: ", words[0]), ("sample tags: ", tags[0])):
    print(label, sample)

sample words:  ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
sample tags:  ['NOUN', 'NOUN', '.', 'NUM', 'NOUN', 'ADJ', '.', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'NOUN', 'NUM', '.']


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np


def set_vocab(words):
    """Build forward and reverse lookup tables for every distinct token.

    Args:
        words: Iterable of token sequences (for example a list of sentences,
            each a list of words or of tags).

    Returns:
        A ``(token_to_index, index_to_token)`` pair of dicts. Indices are
        plain Python ints starting at 1; index 0 is left unassigned so it can
        serve as the padding value when short sentences are padded later.
    """
    # Sort the distinct tokens so the numbering is identical on every run.
    # Iterating a raw set of strings yields an order that changes between
    # interpreter sessions because of string hash randomization.
    unique_tokens = sorted({token for sequence in words for token in sequence})
    word_to_index = {
        token: index for index, token in enumerate(unique_tokens, start=1)
    }
    index_to_word = dict(enumerate(unique_tokens, start=1))
    return word_to_index, index_to_word

In [None]:
word_to_index, index_to_word = set_vocab(words)
tag_to_index, index_to_tag = set_vocab(tags)

# The tag table is tiny, so show it in full.
print(tag_to_index)
# The word table holds tens of thousands of entries; show its size and a
# short preview instead of flooding the cell output with the whole dict.
print(len(word_to_index), "words, e.g.", list(word_to_index.items())[:10])

{'.': np.int64(1), 'CONJ': np.int64(2), 'DET': np.int64(3), 'ADP': np.int64(4), 'PRT': np.int64(5), 'X': np.int64(6), 'PRON': np.int64(7), 'NOUN': np.int64(8), 'ADV': np.int64(9), 'NUM': np.int64(10), 'VERB': np.int64(11), 'ADJ': np.int64(12)}


In [None]:
# Index 0 is reserved for padding, so each size is the largest assigned index
# plus one. max() is used so the result does not depend on the key order of
# the lookup tables and no intermediate list of all keys has to be built.
word_vocab_size = max(index_to_word) + 1
tag_vocab_size = max(index_to_tag) + 1
print("word vocabulary size:", word_vocab_size)
print("tag vocabulary size:", tag_vocab_size)

word vocabulary size: 67068
tag vocabulary size: 13


In [None]:
from sklearn.model_selection import train_test_split

# Before splitting, join each sentence's words (and, separately, its tags)
# into one space-separated string so every sample is a plain string.
# A fixed random_state keeps the train/test partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    [" ".join(w) for w in words],
    [" ".join(w) for w in tags],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report how many sentences ended up in each split.
n_train, n_test = len(X_train), len(X_test)
print("Training data size: %d" % n_train)
print("Testing data size: %d" % n_test)

Training data size: 57761
Testing data size: 14441


In [None]:
import tensorflow as tf

# Wrap each split as a tf.data dataset whose elements are (sentence, tags) pairs.
train_tfdata, test_tfdata = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Use iter() and next() to pull the first example out of the training dataset

x = iter(train_tfdata)
tmp_inp = next(x)
print(tmp_inp)

(<tf.Tensor: shape=(), dtype=string, numpy=b"And let me add Murray's new book as another symptom of it , particularly so in view of the attention Time magazine gave it when it came out recently .">, <tf.Tensor: shape=(), dtype=string, numpy=b'CONJ VERB PRON VERB NOUN ADJ NOUN ADP DET NOUN ADP PRON . ADV ADV ADP NOUN ADP DET NOUN NOUN NOUN VERB PRON ADV PRON VERB PRT ADV .'>)


In [None]:
# Split the word string and the tag string on " " and map each token to its index
def encode(word, tag):
    """Convert one (sentence, tags) pair into two lists of integer indices.

    Both arguments must expose ``.numpy()`` returning bytes (as scalar string
    tensors do). The decoded text is split on single spaces and each token is
    looked up in the module-level ``word_to_index`` / ``tag_to_index`` tables;
    a token missing from its table raises ``KeyError``.
    """
    word_tokens = word.numpy().decode().split(" ")
    word_ids = list(map(word_to_index.__getitem__, word_tokens))
    tag_tokens = tag.numpy().decode().split(" ")
    tag_ids = list(map(tag_to_index.__getitem__, tag_tokens))
    return word_ids, tag_ids


# Wrap encode with tf.py_function so it can be applied inside a tf.data pipeline
def tf_encode(word, tag):
    """Run the Python-side ``encode`` on a (word, tag) pair from within TensorFlow.

    The two outputs of ``encode`` are returned as ``tf.int32`` tensors.
    """
    return tf.py_function(func=encode, inp=[word, tag], Tout=[tf.int32, tf.int32])

In [None]:
buffer_size = 320
batch_size = 32
# Sentences differ in length, so each batch is padded up to the length of its
# longest member (padded_batch fills with 0 by default).
padded_shapes = (tf.TensorShape([None]), tf.TensorShape([None]))
train_generator = (
    train_tfdata.map(tf_encode, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .shuffle(buffer_size)
    .padded_batch(batch_size, padded_shapes=padded_shapes)
    .repeat()
    # Prepare upcoming batches while the current one is being consumed.
    .prefetch(tf.data.AUTOTUNE)
)
# The test pipeline is iterated several times (validation after each epoch,
# evaluation, prediction); caching the encoded examples avoids re-running the
# Python encoder on every pass.
test_generator = (
    test_tfdata.map(tf_encode, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .padded_batch(batch_size, padded_shapes=padded_shapes)
    .prefetch(tf.data.AUTOTUNE)
)

# Peek at one padded training batch.
x = iter(train_generator)
tmp_inp = next(x)
print(tmp_inp)

(<tf.Tensor: shape=(32, 58), dtype=int32, numpy=
array([[ 8693, 41386, 15468, ...,     0,     0,     0],
       [ 2767, 57470, 64502, ...,     0,     0,     0],
       [15156, 40670, 41882, ...,     0,     0,     0],
       ...,
       [37992, 65100, 30329, ..., 52315, 60802, 42419],
       [ 3525,   373, 57896, ...,     0,     0,     0],
       [ 6359, 57034,   830, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(32, 58), dtype=int32, numpy=
array([[12,  8, 11, ...,  0,  0,  0],
       [ 8, 11, 10, ...,  0,  0,  0],
       [ 3, 11,  9, ...,  0,  0,  0],
       ...,
       [ 9,  1,  8, ...,  8, 11,  1],
       [ 4,  4,  3, ...,  0,  0,  0],
       [ 3,  8,  8, ...,  0,  0,  0]], dtype=int32)>)


In [None]:
class postag_rnn(tf.keras.Model):
    """Sequence tagger: embedding -> SimpleRNN -> per-timestep softmax over tags.

    Args:
        embedding_size: Dimensionality of the word embedding vectors.
        rnn_units: Number of units in the recurrent layer.
        vocab_size: Number of rows in the embedding table (largest word index
            plus one). Defaults to the module-level ``word_vocab_size``.
        num_tags: Number of output classes (largest tag index plus one).
            Defaults to the module-level ``tag_vocab_size``.
        mask_zero: Forwarded to the embedding layer. When True, Keras treats
            input index 0 as padding to be masked. Defaults to False, which
            leaves padded positions unmasked exactly as before. Callers that
            enable it should select predictions with the true-label padding
            mask rather than by testing predicted values against 0.
    """

    def __init__(
        self,
        embedding_size,
        rnn_units,
        *,
        vocab_size=None,
        num_tags=None,
        mask_zero=False,
    ):
        super().__init__()
        # Fall back to the notebook-level sizes when none are passed, so the
        # existing two-argument construction keeps working unchanged.
        if vocab_size is None:
            vocab_size = word_vocab_size
        if num_tags is None:
            num_tags = tag_vocab_size

        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_size, mask_zero=mask_zero
        )
        # Recurrent layer; return_sequences=True yields one hidden state per timestep
        self.rnn = tf.keras.layers.SimpleRNN(rnn_units, return_sequences=True)
        # Output layer producing a probability distribution over the tags
        output_layer = tf.keras.layers.Dense(units=num_tags, activation="softmax")
        # This is a many-to-many prediction (every position gets its own tag),
        # so TimeDistributed re-applies the same output layer at each timestep
        self.timedistributed = tf.keras.layers.TimeDistributed(output_layer)

    def call(self, x):
        """Map a batch of word-index sequences to per-position tag probabilities."""
        embedded = self.embedding(x)
        hidden_states = self.rnn(embedded)
        outputs = self.timedistributed(hidden_states)
        return outputs

In [None]:
embedding_size = 256
rnn_units = 512

model = postag_rnn(embedding_size, rnn_units)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"]
)
# train_generator repeats forever, so the length of one epoch must be given
# explicitly. Ceiling division yields exactly the number of batches in one
# pass over the training set, including a final partial batch; a plain
# `// batch_size + 1` would overshoot by one batch whenever the sample count
# is an exact multiple of batch_size.
steps_per_epoch = (len(X_train) + batch_size - 1) // batch_size
model.fit(
    train_generator,
    epochs=2,
    validation_data=test_generator,
    steps_per_epoch=steps_per_epoch,
)

loss, accuracy = model.evaluate(test_generator)
print("test dataset's accuracy: {:.2f}".format(accuracy))

Epoch 1/2
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 121ms/step - accuracy: 0.9273 - loss: 0.2434 - val_accuracy: 0.9810 - val_loss: 0.0527
Epoch 2/2
[1m1806/1806[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 118ms/step - accuracy: 0.9843 - loss: 0.0430 - val_accuracy: 0.9800 - val_loss: 0.0585
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9805 - loss: 0.0574
test dataset's accuracy: 0.98


In [None]:
# testing_preds stores the predicted tag indices of each test sentence;
# testing_true stores the corresponding ground-truth tag indices.
testing_preds, testing_true = [], []

In [None]:
# Predict the test set batch by batch and collect, for every sentence, the
# predicted and the true tag indices with the padding stripped.
# The result lists are (re)created here so that re-running this cell never
# appends duplicates to the output of an earlier run.
testing_preds = []
testing_true = []

# Outer loop: one padded batch of test sentences per iteration.
# The batch tensors get their own names so the corpus-level `words` and `tags`
# lists are not overwritten.
for batch_words, batch_tags in test_generator:
    # verbose=0 silences the progress bar Keras would print for every batch.
    batch_probs = model.predict(batch_words, verbose=0)
    # Class 0 is the padding label and never a valid tag for a real token, so
    # pick the best class among indices 1.. and shift the result back by one.
    batch_pred_index = np.argmax(batch_probs[..., 1:], axis=-1) + 1
    batch_true_index = batch_tags.numpy()

    # Inner loop: keep exactly the positions whose true label is not padding,
    # so each prediction stays aligned with its ground-truth tag.
    for pred_row, true_row in zip(batch_pred_index, batch_true_index):
        is_token = true_row != 0
        testing_preds.append(pred_row[is_token].tolist())
        testing_true.append(true_row[is_token].tolist())

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 

In [None]:
# Print the test entry at index 10 as a spot check
print_index = 10
word, pred, true = (
    X_test[print_index],
    testing_preds[print_index],
    testing_true[print_index],
)

# Translate tag indices back into tag names.
pred_tag = list(map(index_to_tag.__getitem__, pred))
true_tag = list(map(index_to_tag.__getitem__, true))

print("Input words: \n", word)
print("Prediction: \n", pred_tag)
print("True: \n", true_tag)

Input words: 
 LOTUS DEVELOPMENT Corp. 's net income rose 61 % in the third quarter from the year-earlier period .
Prediction: 
 ['NOUN', 'NOUN', 'NOUN', 'PRT', 'NOUN', 'NOUN', 'VERB', 'NUM', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', '.']
True: 
 ['NOUN', 'NOUN', 'NOUN', 'PRT', 'ADJ', 'NOUN', 'VERB', 'NUM', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', '.']
