In [1]:
#ASSIGNMENT: 2
#Task-1: Part Of Speech Tagging
#Using GRU with Keras: POS Tagging

In [1]:
#Importing the libraries
from keras.layers.core import Activation, Dense, Dropout, RepeatVector, SpatialDropout1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import TimeDistributed
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import nltk
import numpy as np
import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#Preparing the Data using NLTK Treebank Corpus
DATA_DIR = os.getcwd()
open(os.path.join(DATA_DIR, "treebank_sents.txt"), "w").close()
open(os.path.join(DATA_DIR, "treebank_poss.txt"), "w").close()

fedata = open(os.path.join(DATA_DIR, "treebank_sents.txt"), "a")
ffdata = open(os.path.join(DATA_DIR, "treebank_poss.txt"), "a")
sents = nltk.corpus.treebank.tagged_sents()
for sent in sents:
    words, poss = [], []
    for word, pos in sent:
        if pos == "-NONE-":
            continue
        words.append(word)
        poss.append(pos)
    fedata.write(" ". join(words))
    fedata.write('\n')
    ffdata.write(" ".join(poss))
    ffdata.write('\n')
fedata.close()
ffdata.close()

In [3]:
#Preparing the Vocabularies for the words in the sentences for POS Tagging
def parse_sentences(filename):
    word_freqs = collections.Counter()
    num_recs, maxlen = 0, 0
    fin =open(filename, 'rb')
    for line in fin:
        words = line.strip().lower().split()
        for word in words:
            word_freqs[word] += 1
        if len(words) > maxlen:
            maxlen = len(words)
        num_recs += 1
    fin.close()
    return word_freqs, maxlen, num_recs

s_wordfreqs, s_maxlen, s_numrecs = parse_sentences(os.path.join(DATA_DIR, "treebank_sents.txt"))
t_wordfreqs, t_maxlen, t_numrecs = parse_sentences(os.path.join(DATA_DIR, "treebank_poss.txt"))
# print(len(s_wordfreqs), s_maxlen, s_numrecs, len(t_wordfreqs), t_maxlen, t_numrecs)

In [4]:
#Representing the each row of the input as a sequence of word indices.
MAX_SEQLEN = 250
S_MAX_FEATURES = 5000
T_MAX_FEATURES = 45
s_vocabsize = min(len(s_wordfreqs), S_MAX_FEATURES) + 2
s_word2index = {x[0]:i+2 for i, x in
enumerate(s_wordfreqs.most_common(S_MAX_FEATURES))}
s_word2index["PAD"] = 0
s_word2index["UNK"] = 1
s_index2word = {v:k for k, v in s_word2index.items()}
t_vocabsize = len(t_wordfreqs) + 1
t_word2index = {x[0]:i for i, x in
enumerate(t_wordfreqs.most_common(T_MAX_FEATURES))}
t_word2index["PAD"] = 0
t_index2word = {v:k for k, v in t_word2index.items()}

In [6]:
#Building the data sets to feed to the network and converting the input sentences to into a word sequence.
def build_tensor(filename, numrecs, word2index, maxlen,make_categorical=False, num_classes=0):
    data = np.empty((numrecs, ), dtype=list)
    fin = open(filename, "rb")
    i = 0
    for line in fin:
        wids = []
        for word in line.strip().lower().split():
            if word in word2index:
                wids.append(word2index[word])
            else:
                wids.append(word2index['UNK'])
        if make_categorical:
            data[i] = np_utils.to_categorical(wids,num_classes=num_classes)
        else:
            data[i] = wids
        i += 1
    fin.close()
    pdata = sequence.pad_sequences(data, maxlen=maxlen)
    return pdata

X = build_tensor(os.path.join(DATA_DIR, "treebank_sents.txt"),s_numrecs, s_word2index, MAX_SEQLEN)
Y = build_tensor(os.path.join(DATA_DIR, "treebank_poss.txt"),t_numrecs, t_word2index, MAX_SEQLEN,
                 True, t_vocabsize)

In [8]:
#Splitting the dataset into 80-20 train-test split.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [11]:
#Defining the model
EMBED_SIZE = 90
HIDDEN_SIZE = 60
BATCH_SIZE = 32
NUM_EPOCHS = 2
model = Sequential()
model.add(Embedding(s_vocabsize, EMBED_SIZE,input_length=MAX_SEQLEN))
#model.add(SpatialDropout1D(Dropout(0.2)))
model.add(GRU(HIDDEN_SIZE, dropout=0.2, recurrent_dropout=0.2))
model.add(RepeatVector(MAX_SEQLEN))
model.add(GRU(HIDDEN_SIZE, return_sequences=True))
model.add(TimeDistributed(Dense(t_vocabsize)))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam",
metrics=["accuracy"])

In [12]:
#Training the model 
model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS,
validation_data=[Xtest, Ytest])
score, acc = model.evaluate(Xtest, Ytest, batch_size=BATCH_SIZE)
print("Test score: %.3f, accuracy: %.3f" % (score, acc))

Train on 3131 samples, validate on 783 samples
Epoch 1/2
Epoch 2/2
Test score: 0.292, accuracy: 0.911
