In [1]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
import tkinter as tk

#### Download the following corpora: treebank, brown, conll2000

In [2]:
# Fetch the three POS-tagged corpora that are combined into the dataset below.
# NLTK reports "already up-to-date" and skips the download when a package is present.
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\D\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\D\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\D\AppData\Roaming\nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


True

In [3]:
# Tag-mapping resource; the corpora are read with tagset='universal' further down.
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\D\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [4]:
from nltk.corpus import treebank,brown,conll2000

In [5]:
# Concatenate the three corpora, each read with the universal tagset,
# in the order treebank -> brown -> conll2000.
tagged_sents = (
    treebank.tagged_sents(tagset='universal')
    + brown.tagged_sents(tagset='universal')
    + conll2000.tagged_sents(tagset='universal')
)

In [6]:
# Corpus size, followed by one sample sentence as (word, tag) pairs.
n_sents = len(tagged_sents)
print(f"Dataset Size : {n_sents}\n")

example_sent = tagged_sents[10]
print(f"Example : {example_sent}")

Dataset Size : 72202

Example : [('Neither', 'DET'), ('Lorillard', 'NOUN'), ('nor', 'CONJ'), ('the', 'DET'), ('researchers', 'NOUN'), ('who', 'PRON'), ('*T*-3', 'X'), ('studied', 'VERB'), ('the', 'DET'), ('workers', 'NOUN'), ('were', 'VERB'), ('aware', 'ADJ'), ('of', 'ADP'), ('any', 'DET'), ('research', 'NOUN'), ('on', 'ADP'), ('smokers', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('Kent', 'NOUN'), ('cigarettes', 'NOUN'), ('.', '.')]


In [7]:
# Split every tagged sentence into two parallel lists:
# sentences[i][j] is a word and tags[i][j] is the tag of that word.
sentences, tags = [], []

for tagged_sent in tagged_sents:
    # Comprehensions instead of `zip(*tagged_sent)`: unpacking the zip of an
    # empty sentence into two names raises ValueError, whereas this simply
    # yields two empty lists.
    sentences.append([word for word, _ in tagged_sent])
    tags.append([tag for _, tag in tagged_sent])

In [8]:
# Sanity check: there should be exactly one tag list per sentence.
len(sentences),len(tags)

(72202, 72202)

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Target proportions of the train / test / validation splits.
train_ratio = 0.75
test_ratio = 0.15
val_ratio = 0.10

# Step 1: hold out everything that is not training data.
holdout_fraction = 1 - train_ratio
x_train, x_holdout, y_train, y_holdout = train_test_split(
    sentences, tags, test_size=holdout_fraction, random_state=42
)

# Step 2: divide the hold-out portion. `test_size` is the test share of it,
# so the first pair returned is validation and the second pair is test.
test_share = test_ratio / (test_ratio + val_ratio)
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=test_share, random_state=42
)

In [11]:
# Number of sentences (x) and tag sequences (y) in each split.
n_x_train, n_y_train = len(x_train), len(y_train)
n_x_test, n_y_test = len(x_test), len(y_test)
n_x_val, n_y_val = len(x_val), len(y_val)

print(f"x_train size : {n_x_train}, y_train size : {n_y_train}")
print(f"x_test size : {n_x_test}, y_test size : {n_y_test}")
print(f"x_val size : {n_x_val}, y_val size : {n_y_val}")

x_train size : 54151, y_train size : 54151
x_test size : 10831, y_test size : 10831
x_val size : 7220, y_val size : 7220


In [12]:
# Word-level tokenizer for the input sentences; "<OOV>" is the placeholder
# token for words that are not in the fitted vocabulary.
sentence_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<OOV>")

In [13]:
# Build the word vocabulary from the training sentences only.
sentence_tokenizer.fit_on_texts(x_train)

In [14]:
# Encode the training sentences as lists of vocabulary indices, then show one
# sentence next to its encoding.
x_train_seqs = sentence_tokenizer.texts_to_sequences(x_train)

sample_words, sample_word_ids = x_train[5], x_train_seqs[5]
print(f"Text : {sample_words} \nSeq : {sample_word_ids}")

Text : ['And', 'she', 'replied', ',', '``', 'I', 'was', 'born', 'in', 'America', ',', 'but', 'I', 'was', 'conceived', 'in', 'Vienna', "''", '.'] 
Seq : [6, 55, 2359, 3, 13, 30, 14, 1193, 9, 502, 3, 31, 30, 14, 4467, 9, 7376, 15, 4]


In [15]:
# Round-trip check: decode one encoded sentence back to text.
# Only the sample is decoded; decoding all of x_train_seqs just to pick
# element 5 converts the entire training set for a single printed line.
roundtrip_seq = x_train_seqs[5]
roundtrip_text = sentence_tokenizer.sequences_to_texts([roundtrip_seq])[0]
print(f"Seq : {roundtrip_seq} \nText : {roundtrip_text}")

Seq : [6, 55, 2359, 3, 13, 30, 14, 1193, 9, 502, 3, 31, 30, 14, 4467, 9, 7376, 15, 4] 
Text : and she replied , `` i was born in america , but i was conceived in vienna '' .


In [16]:
# Separate tokenizer for the tag vocabulary, fitted on the training tags.
tag_tokenizer = tf.keras.preprocessing.text.Tokenizer()
tag_tokenizer.fit_on_texts(y_train)
# Display the tag -> id mapping; ids start at 1, which leaves 0 free for padding.
tag_tokenizer.word_index

{'noun': 1,
 'verb': 2,
 '.': 3,
 'adp': 4,
 'det': 5,
 'adj': 6,
 'adv': 7,
 'pron': 8,
 'conj': 9,
 'prt': 10,
 'num': 11,
 'x': 12}

In [17]:
# Encode the training tag sequences as tag ids, then show one tag sequence
# next to its encoding.
y_train_seqs = tag_tokenizer.texts_to_sequences(y_train)

sample_tags, sample_tag_ids = y_train[5], y_train_seqs[5]
print(f"Text : {sample_tags} \nSeq : {sample_tag_ids}")

Text : ['CONJ', 'PRON', 'VERB', '.', '.', 'PRON', 'VERB', 'VERB', 'ADP', 'NOUN', '.', 'CONJ', 'PRON', 'VERB', 'VERB', 'ADP', 'NOUN', '.', '.'] 
Seq : [9, 8, 2, 3, 3, 8, 2, 2, 4, 1, 3, 9, 8, 2, 2, 4, 1, 3, 3]


In [18]:
# Encode the held-out splits with the tokenizers that were fitted on the
# training data: words with the sentence tokenizer, tags with the tag tokenizer.
x_test_seqs, x_val_seqs = (
    sentence_tokenizer.texts_to_sequences(split) for split in (x_test, x_val)
)
y_test_seqs, y_val_seqs = (
    tag_tokenizer.texts_to_sequences(split) for split in (y_test, y_val)
)

In [19]:
# Bring every training sentence to a fixed length of 271 ids: shorter ones are
# padded at the end, longer ones are cut at the end.
x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    x_train_seqs,
    maxlen=271,
    padding='post',
    truncating='post',
)

In [20]:
# Pad / truncate the training tag sequences the same way as the sentences so
# that words and tags stay aligned position by position.
y_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    y_train_seqs,
    maxlen=271,
    padding='post',
    truncating='post',
)

In [21]:
# Peek at the first five padded tag sequences (trailing zeros are padding).
y_train_padded[:5]

array([[10,  2,  1, ...,  0,  0,  0],
       [ 8,  2,  4, ...,  0,  0,  0],
       [ 1,  1,  2, ...,  0,  0,  0],
       [ 8,  2,  5, ...,  0,  0,  0],
       [ 4,  5,  1, ...,  0,  0,  0]])

In [22]:
# Pad / truncate the held-out splits with the same settings as the training
# data: fixed length 271, padding and truncation both applied at the end.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
pad_options = dict(maxlen=271, padding='post', truncating='post')

y_test_padded = pad_sequences(sequences=y_test_seqs, **pad_options)
y_val_padded = pad_sequences(sequences=y_val_seqs, **pad_options)

x_test_padded = pad_sequences(sequences=x_test_seqs, **pad_options)
x_val_padded = pad_sequences(sequences=x_val_seqs, **pad_options)

In [23]:
# Sanity check: a padded sentence from each split has the same fixed length.
len(x_train_padded[10]),len(x_test_padded[10]),len(x_val_padded[10])

(271, 271, 271)

In [24]:
# Sanity check: a padded tag sequence from each split has the same fixed length.
len(y_train_padded[10]),len(y_test_padded[10]),len(y_val_padded[10])

(271, 271, 271)

In [25]:
# One-hot encode the padded tag ids for use with categorical cross-entropy.
#
# The class count is passed explicitly. Without it, to_categorical infers the
# width from the largest id present in each array, so a split that happens to
# lack the highest tag id would get a narrower last axis than the training
# labels and the model output.
n_tag_classes = len(tag_tokenizer.word_index) + 1  # +1 for the padding id 0

y_train_cat = tf.keras.utils.to_categorical(y_train_padded, num_classes=n_tag_classes)
y_test_cat = tf.keras.utils.to_categorical(y_test_padded, num_classes=n_tag_classes)
y_val_cat = tf.keras.utils.to_categorical(y_val_padded, num_classes=n_tag_classes)

In [26]:
# Sizes of the word and tag vocabularies; the +1 accounts for id 0, which the
# tokenizers do not assign and which is used as the padding value.
num_tokens = len(sentence_tokenizer.word_index) + 1
num_classes = len(tag_tokenizer.word_index) + 1

In [27]:
# Display (word vocabulary size, number of tag classes), both including id 0.
num_tokens,num_classes

(51891, 13)

In [28]:
from tensorflow.keras import layers

In [29]:
# Sequence tagger: word embedding -> bidirectional LSTM -> per-position softmax
# over the tag classes. mask_zero=True marks id 0 (padding) as masked input.
model = tf.keras.Sequential([
    layers.Embedding(
        input_dim=num_tokens,
        output_dim=128,
        input_length=271,
        mask_zero=True,
    ),
    # return_sequences=True keeps one output vector per input position.
    layers.Bidirectional(layers.LSTM(128, return_sequences=True)),
    layers.Dense(num_classes, activation='softmax'),
])

In [30]:
# Categorical cross-entropy matches the one-hot encoded tag labels.
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [31]:
# Stop training once val_loss has not improved for 2 consecutive epochs.
# restore_best_weights=True makes the model keep the weights of the epoch with
# the lowest val_loss when early stopping triggers; without it the model is
# left with the weights of the last (worse) epoch.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Train for at most 5 epochs, validating on the validation split each epoch.
model.fit(
    x_train_padded,
    y_train_cat,
    epochs=5,
    validation_data=(x_val_padded, y_val_cat),
    callbacks=[es],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x221d289d910>

In [46]:
# Predict tags for the first test sentence; a leading batch axis of size 1 is
# added because the model expects a batch of sequences.
first_test_batch = x_test_padded[0][np.newaxis, :]
res = model.predict(first_test_batch)



In [53]:
# Most probable tag id for each real word of the sentence: positions beyond
# the original sentence length are padding and are dropped before the argmax.
n_words = len(x_test[0])
list(np.argmax(res[-1][:n_words], axis=-1))

[4, 6, 1, 8, 2, 10, 3, 11, 9, 6, 1, 4, 3, 11, 9, 1, 1, 6, 1, 3]

In [50]:
import pickle

In [51]:
# Persist everything needed to run the tagger outside this notebook.
tf.keras.models.save_model(model,'pos_predictor.keras')

# Word tokenizer: turns input words into the ids the model was trained on.
with open('tokenizer.pickle','wb') as file:
    pickle.dump(sentence_tokenizer,file)

# Tag tokenizer: saved as well, because without its word_index the class ids
# predicted by the model cannot be mapped back to tag names.
# NOTE: pickle files must only be loaded from trusted locations.
with open('tag_tokenizer.pickle','wb') as file:
    pickle.dump(tag_tokenizer,file)