In [19]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/manojbhadu/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/manojbhadu/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [8]:
brown_corpus_sent = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [12]:
print("Total sentences with tags: ", len(brown_corpus_sent))
print("total tagged words:", len(nltk.corpus.brown.tagged_words()))
brown_corpus_sent[0][1]

Total sentences with tags:  57340
total tagged words: 1161192


('Fulton', 'NOUN')

#### Pre processing

In [13]:
sentences, sentence_tags =[], [] 
for tagged_sentence in brown_corpus_sent:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))


In [52]:
training_sentences, test_sentences, training_tags, test_tags = train_test_split(sentences, sentence_tags, test_size=0.2)

In [53]:
vocab = set([])
unique_tags = set([])
for sent in training_sentences:
    for word in sent:
        vocab.add(word.lower())
for sent_tag in training_tags:
    for tag in sent_tag:
        unique_tags.add(tag)

In [54]:
print(len(unique_tags),len(vocab))

12 45079


In [58]:
#Converting words to integar and adding pad and unkonwn words as integar as 0 and 1
word2int = {word: i + 2 for i, word in enumerate(list(vocab))}
word2int['-PAD-'] = 0
word2int['-UNK-'] = 1 
 
tag2int = {tag: i + 1 for i, tag in enumerate(list(unique_tags))}
tag2int['-PAD-'] = 0  

In [59]:
word2int['were'], tag2int["ADJ"]

(38201, 10)

In [64]:
#Convert test and trainig dataset to integer
X_train, X_test, Y_train, Y_test = [],[],[],[]
for sent in training_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_train.append(sent_to_int)

for sent in test_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_test.append(sent_to_int)
    
    
for sent_tag in training_tags:
    Y_train.append([tag2int[tag] for tag in sent_tag])
    
for sent_tag in test_tags:
    Y_test.append([tag2int[tag] for tag in sent_tag])

In [73]:
print(X_train[0],"\n",Y_train[0])
print(X_test[0],"\n",Y_test[0])

[38390, 9062, 25010, 31274, 28422, 29954, 29954] 
 [5, 12, 4, 1, 2, 7, 7]
[6165, 36863, 14525, 2287, 31274, 42109, 6864, 13550, 28271, 27483, 31274, 36123, 36061, 38390, 20715, 2287, 31274, 2648, 36123, 4415, 13341, 31274, 42109, 6864, 13550, 28271, 29954, 29954] 
 [4, 3, 12, 6, 1, 7, 2, 7, 4, 6, 1, 2, 7, 5, 12, 6, 1, 2, 4, 11, 12, 1, 7, 2, 7, 4, 7, 7]


In [78]:
#Now we add padding to every sentence for equalling the length to max length of sentences
MAX_LENGTH = len(max(X_train, key=len))
print(MAX_LENGTH) 


180


In [82]:
from keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
Y_train = pad_sequences(Y_train, maxlen=MAX_LENGTH, padding='post')
Y_test = pad_sequences(Y_test, maxlen=MAX_LENGTH, padding='post')
 
#print(X_train[0])
#print(X_test[0])
#print(Y_train[0])
#print(Y_test[0])

In [98]:
#we will define the accuracy without consideration of padding beacause if not then accuracy will be high due to 
#prediction of padding tags

from keras import backend as K
 
def no_pad_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [99]:
#Now we define the network architecture
#we will use the word embeddins for word to vector
#we have used Bidirectional LSTM, and model return a sequence

#first import library
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [101]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2int), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2int))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy',no_pad_accuracy(0)])
 
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 180, 128)          5770368   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 180, 512)          788480    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 180, 13)           6669      
_________________________________________________________________
activation_2 (Activation)    (None, 180, 13)           0         
Total params: 6,565,517
Trainable params: 6,565,517
Non-trainable params: 0
_________________________________________________________________


In [102]:
#Convert to one hot vector

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)
 

In [103]:
cat_train_tags_y = to_categorical(Y_train, len(tag2int))


In [104]:
cat_train_tags_y[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [106]:
model.fit(X_train, to_categorical(Y_train, len(tag2int)), batch_size=128, epochs=40, validation_split=0.2)


In [None]:
scores = model.evaluate(X_test, to_categorical(Y_test, len(tag2int)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")   