In [23]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:
nltk.download('brown')
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [25]:
brown_corpus_sent = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

In [26]:
print("Total sentences with tags: ", len(brown_corpus_sent))
print("total tagged words:", len(nltk.corpus.brown.tagged_words()))
brown_corpus_sent[0][1]

Total sentences with tags:  57340
total tagged words: 1161192


('Fulton', 'NOUN')

#### Pre processing

In [27]:
sentences, sentence_tags =[], [] 
for tagged_sentence in brown_corpus_sent:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))


In [28]:
num_sents = len(sentences)
k = 5
foldsize = int(num_sents/5)


k_folds = {}
for i in range(5):
    # Locate the test set in the fold.
    k_folds["test_sent{0}".format(i)] = sentences[i*foldsize:i*foldsize+foldsize]
    k_folds["train_sent{0}".format(i)] = sentences[:i*foldsize] + sentences[i*foldsize+foldsize:]
    k_folds["test_tags{0}".format(i)] = sentence_tags[i*foldsize:i*foldsize+foldsize]
    k_folds["train_tags{0}".format(i)] = sentence_tags[:i*foldsize] + sentence_tags[i*foldsize+foldsize:]

In [29]:
training_sentences, test_sentences, training_tags, test_tags = k_folds['train_sent4'],k_folds['test_sent4'],k_folds['train_tags4'],k_folds['test_tags4']

In [30]:
vocab = set([])
unique_tags = set([])
for sent in training_sentences:
    for word in sent:
        vocab.add(word.lower())
for sent_tag in training_tags:
    for tag in sent_tag:
        unique_tags.add(tag)

In [33]:
print(len(unique_tags),len(vocab))

12 45755


In [34]:
#Converting words to integer and adding pad and unkonwn words as integer as 0 and 1
word2int = {word: i + 2 for i, word in enumerate(list(vocab))}
word2int['-PAD-'] = 0
word2int['-UNK-'] = 1 
 
tag2int = {tag: i + 1 for i, tag in enumerate(list(unique_tags))}
tag2int['-PAD-'] = 0  

In [35]:
word2int['were'], tag2int["ADJ"]

(28154, 8)

In [36]:
#Convert test and trainig dataset to integer
X_train, X_test, Y_train, Y_test = [],[],[],[]
for sent in training_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_train.append(sent_to_int)

for sent in test_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_test.append(sent_to_int)
    
    
for sent_tag in training_tags:
    Y_train.append([tag2int[tag] for tag in sent_tag])
    
for sent_tag in test_tags:
    Y_test.append([tag2int[tag] for tag in sent_tag])

In [37]:
print(X_train[0],"\n",Y_train[0])
print(X_test[0],"\n",Y_test[0])

[39888, 23054, 43783, 5751, 42582, 25707, 2961, 7128, 36824, 13516, 31658, 38472, 2281, 2999, 12109, 32679, 33486, 41231, 6641, 21442, 44434, 8826, 24664, 17828, 3941] 
 [10, 12, 12, 8, 12, 11, 12, 10, 12, 3, 12, 8, 12, 12, 11, 1, 10, 12, 1, 3, 10, 12, 11, 12, 1]
[39888, 27669, 41019, 4439, 45678, 18030, 17938, 41784, 43276, 31090, 26831, 21736, 7791, 38192, 11462, 3941] 
 [10, 12, 11, 3, 10, 8, 12, 1, 2, 9, 11, 11, 9, 3, 9, 1]


In [38]:
#Now we add padding to every sentence for equalling the length to max length of sentences
MAX_LENGTH = len(max(X_train, key=len))
print(MAX_LENGTH) 


180


In [39]:
from keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
Y_train = pad_sequences(Y_train, maxlen=MAX_LENGTH, padding='post')
Y_test = pad_sequences(Y_test, maxlen=MAX_LENGTH, padding='post')
 
#print(X_train[0])
#print(X_test[0])
#print(Y_train[0])
#print(Y_test[0])

### Model Building using keras

In [40]:
#we will define the accuracy without consideration of padding beacause if not then accuracy will be high due to 
#prediction of padding tags

from keras import backend as K
 
def no_pad_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [41]:
#Now we define the network architecture
#we will use the word embeddins for word to vector
#we have used Bidirectional LSTM, and model return a sequence

#first import library
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [42]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2int), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2int))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy',no_pad_accuracy(0)])
 
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 180, 128)          5856896   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 180, 512)          788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 180, 13)           6669      
_________________________________________________________________
activation_1 (Activation)    (None, 180, 13)           0         
Total params: 6,652,045
Trainable params: 6,652,045
Non-trainable params: 0
_________________________________________________________________


In [43]:
#Convert to one hot vector

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)
 

In [44]:
cat_train_tags_y = to_categorical(Y_train, len(tag2int))


In [45]:
cat_train_tags_y[0]

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

### Model Training

In [46]:
model.fit(X_train, to_categorical(Y_train, len(tag2int)), batch_size=128, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7bd8a046a0>

In [47]:
scores = model.evaluate(X_test, to_categorical(Y_test, len(tag2int)))
print(f"{model.metrics_names[2]}: {scores[2] * 100}")   

ignore_accuracy: 96.01406455039978


### Sample Example try

In [48]:
test_samples = [
    "we will be having a quiz on monday .".split(),
    "I am very big fan of lionel messi's playing style .".split()
]
print(test_samples)

[['we', 'will', 'be', 'having', 'a', 'quiz', 'on', 'monday', '.'], ['I', 'am', 'very', 'big', 'fan', 'of', 'lionel', "messi's", 'playing', 'style', '.']]


In [49]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2int[w.lower()])
        except KeyError:
            s_int.append(word2int['-UNK-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[15100 13745 35534 23441 45678  6243 24283 11249  3941     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0 

In [50]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        no_pad_tokens=[]
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])

        token_sequences.append(token_sequence)
    return token_sequences

In [51]:
predictions = model.predict(test_samples_X)
tag_prediction = (logits_to_tokens(predictions, {i: t for t, i in tag2int.items()}))

In [1]:
#for converting output tag sequence to without "-PAD" tag, if no labels given
def pred_no_pad_without_labels(predictions):
  final_pred = []
  for tag_sent in predictions:
    tag_sent_no_pad = []
    for tags in tag_sent:
      if tags == "-PAD-":
        break
      else:
        tag_sent_no_pad.append(tags)
    final_pred.append(tag_sent_no_pad)
  return final_pred

In [53]:
final_pred = pred_no_pad_without_labels(tag_prediction)
print(final_pred)

[['PRON', 'VERB', 'VERB', 'VERB', 'DET', 'NOUN', 'ADP', 'NOUN', '.'], ['PRON', 'VERB', 'ADV', 'ADJ', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'VERB', 'NOUN', '.']]


In [2]:
#for converting output tag sequence to without "-PAD" tag, if labels given
def pred_no_pad_with_labels(predictions,test_tags):
  final_pred = []
  for i in range(len(predictions)):
    k = min(len(test_tags[i]),len(predictions[i]))
    tag_sent_no_pad = []
    for j in range(k):
      tag_sent_no_pad.append(predictions[i][j])
    final_pred.append(tag_sent_no_pad)
  return final_pred

### Prediction on test data

In [55]:
test_pred = model.predict(X_test)
test_tag_pred = logits_to_tokens(test_pred, {i: t for t, i in tag2int.items()})
test_tag_pred_no_pad = pred_no_pad_with_labels(test_tag_pred,test_tags)

In [56]:
#for counting if the length of test tag sequneces and predicted tag sequences are equall or not
t=0
for i in range(len(test_tags)):
  for j in range(len(test_tag_pred_no_pad)):
    if i==j:
      if len(test_tags[i])!=len(test_tag_pred_no_pad[i]):
        print(i)
        t+=1
print(t)

0


In [57]:
d = {"true_labels": test_tags, "predicted_labels":test_tag_pred_no_pad}
pd_pred = pd.DataFrame(data=d)

In [59]:
#for confusion matrix we make a one list for true labels and predicted labels each, 
test_words = []
y_true=[]
y_pred=[]
for sent in test_tags:
  for tag in sent:
    y_true.append(tag)

for sent in test_tag_pred_no_pad:
  for tag in sent:
    y_pred.append(tag)

for sent in test_sentences:
  for word in sent:
    test_words.append(word)

In [60]:
d5th = {"words":test_words,"true_label":y_true,"predicted_lable":y_pred}
dataframe_5th = pd.DataFrame(data=d5th)

In [61]:
dataframe_5th.head()

Unnamed: 0,words,true_label,predicted_lable
0,The,DET,DET
1,quarrel,NOUN,NOUN
2,ended,VERB,VERB
3,in,ADP,ADP
4,a,DET,DET


In [62]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_true, y_pred, target_names=list(unique_tags)+['-PAD-']))


Classification Report



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           .       0.00      0.00      0.00         0
        CONJ       1.00      1.00      1.00     29083
         ADP       0.89      0.87      0.88      9855
         ADV       0.97      0.96      0.96     18434
         PRT       0.92      0.90      0.91     10299
           X       0.99      1.00      0.99      5974
         NUM       0.99      0.98      0.99     20030
         ADJ       0.93      0.95      0.94     34030
        PRON       0.91      0.97      0.94      1072
         DET       0.98      0.99      0.98     13784
        VERB       0.92      0.90      0.91      6513
        NOUN       0.96      0.96      0.96     32291
       -PAD-       0.48      0.12      0.19       181

    accuracy                           0.96    181546
   macro avg       0.84      0.82      0.82    181546
weighted avg       0.96      0.96      0.96    181546



In [63]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_true, y_pred,labels=list(unique_tags)+['-PAD-'])
print('Confusion Matrix\n')
print(confusion)

Confusion Matrix

[[29083     0     0     0     0     0     0     0     0     0     0     0
      0]
 [    0  5949     1     1     0     0     0     1     0    15     2     5
      0]
 [    0    23 17642   210   413     0     0    26    39    31    27    23
      0]
 [    0    44   130  9300   103     1     0   369     1    30    74   247
      0]
 [    1     0   334    87  5877     0     0     4     1     2    59   148
      0]
 [    0     0     1     1     1    22     0     3     0     0    26   127
      0]
 [    0     0     1     0     0     0  1040     0     0     0     2    29
      0]
 [    0     0     8   349     7     2     0  8580     0     0   155   754
      0]
 [    0     0    15     3     7     0     0     0 13600   150     1     8
      0]
 [    0     4   102    53     0     0     2     2   196 19665     1     5
      0]
 [    0     0    23    41     5     0     0   132     0     0 31121   969
      0]
 [    0     0    13    68     6    21   101   569     2     1   907 3

In [64]:
pd_confusion = pd.DataFrame((confusion), columns=list(unique_tags)+['-PAD-'],index=list(unique_tags)+['-PAD-'])