In [1]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
nltk.download('conll2000')
train_sents = list(nltk.corpus.conll2000.iob_sents('train.txt'))
test_sents = list(nltk.corpus.conll2000.iob_sents('test.txt'))

[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!


In [3]:
# nltk.download('brown')
# nltk.download('universal_tagset')

In [4]:
print(train_sents[0])

[('Confidence', 'NN', 'B-NP'), ('in', 'IN', 'B-PP'), ('the', 'DT', 'B-NP'), ('pound', 'NN', 'I-NP'), ('is', 'VBZ', 'B-VP'), ('widely', 'RB', 'I-VP'), ('expected', 'VBN', 'I-VP'), ('to', 'TO', 'I-VP'), ('take', 'VB', 'I-VP'), ('another', 'DT', 'B-NP'), ('sharp', 'JJ', 'I-NP'), ('dive', 'NN', 'I-NP'), ('if', 'IN', 'B-SBAR'), ('trade', 'NN', 'B-NP'), ('figures', 'NNS', 'I-NP'), ('for', 'IN', 'B-PP'), ('September', 'NNP', 'B-NP'), (',', ',', 'O'), ('due', 'JJ', 'B-ADJP'), ('for', 'IN', 'B-PP'), ('release', 'NN', 'B-NP'), ('tomorrow', 'NN', 'B-NP'), (',', ',', 'O'), ('fail', 'VB', 'B-VP'), ('to', 'TO', 'I-VP'), ('show', 'VB', 'I-VP'), ('a', 'DT', 'B-NP'), ('substantial', 'JJ', 'I-NP'), ('improvement', 'NN', 'I-NP'), ('from', 'IN', 'B-PP'), ('July', 'NNP', 'B-NP'), ('and', 'CC', 'I-NP'), ('August', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('near-record', 'JJ', 'I-NP'), ('deficits', 'NNS', 'I-NP'), ('.', '.', 'O')]


In [5]:
print("Total sentences with tags: ", len(train_sents))
print("total tagged words:", len(nltk.corpus.conll2000.tagged_words()))


Total sentences with tags:  8936
total tagged words: 259104


#### Pre processing

In [6]:
#for Training Data

In [7]:
train_sentences, train_sentence_pos, train_sentence_chunk =[], [],[] 
for tagged_sentence in train_sents:
    sentence, pos_tags, chunks = zip(*tagged_sentence)
    train_sentences.append(np.array(sentence))
    train_sentence_pos.append(np.array(pos_tags))
    train_sentence_chunk.append(np.array(chunks))

In [8]:
for i in range(len(train_sentence_chunk)):
    for j in range(len(train_sentence_chunk[i])):
        train_sentence_chunk[i][j] = train_sentence_chunk[i][j][0]
train_sentence_chunk[4]

array(['O', 'B', 'I', 'B', 'B', 'B', 'B', 'I', 'I', 'I', 'B', 'B', 'I',
       'B', 'B', 'I', 'I', 'O', 'O', 'B', 'B', 'I', 'O', 'B', 'I', 'I',
       'B', 'B', 'I', 'I', 'O'], dtype='<U6')

for Test Data

In [9]:
test_sentences, test_sentence_pos, test_sentence_chunk =[], [],[] 
for tagged_sentence in test_sents:
    sentence, pos_tags, chunks = zip(*tagged_sentence)
    test_sentences.append(np.array(sentence))
    test_sentence_pos.append(np.array(pos_tags))
    test_sentence_chunk.append(np.array(chunks))

In [10]:
for i in range(len(test_sentence_chunk)):
    for j in range(len(test_sentence_chunk[i])):
        test_sentence_chunk[i][j] = test_sentence_chunk[i][j][0]
test_sentence_chunk[0]

array(['B', 'I', 'I', 'B', 'I', 'I', 'B', 'B', 'B', 'B', 'I', 'I', 'B',
       'B', 'I', 'B', 'B', 'I', 'B', 'I', 'B', 'I', 'B', 'B', 'B', 'I',
       'I', 'O'], dtype='<U4')

In [11]:
# num_sents = len(sentences)
# k = 5
# foldsize = int(num_sents/5)


# k_folds = {}
# for i in range(5):
#     # Locate the test set in the fold.
#     k_folds["test_sent{0}".format(i)] = sentences[i*foldsize:i*foldsize+foldsize]
#     k_folds["train_sent{0}".format(i)] = sentences[:i*foldsize] + sentences[i*foldsize+foldsize:]
#     k_folds["test_tags{0}".format(i)] = sentence_tags[i*foldsize:i*foldsize+foldsize]
#     k_folds["train_tags{0}".format(i)] = sentence_tags[:i*foldsize] + sentence_tags[i*foldsize+foldsize:]

In [12]:
training_sentences, test_sentences, training_chunk, test_chunk = train_sentences, test_sentences, train_sentence_chunk, test_sentence_chunk

In [13]:
vocab = set([])
unique_tags = set([])
for sent in training_sentences:
    for word in sent:
        vocab.add(word.lower())
for sent_chunk in training_chunk:
    for chunk in sent_chunk:
        unique_tags.add(chunk)

In [14]:
print(len(unique_tags),len(vocab))


3 17258


In [15]:
#Converting words to integer and adding pad and unkonwn words as integer as 0 and 1
word2int = {word: i + 2 for i, word in enumerate(list(vocab))}
word2int['-PAD-'] = 0
word2int['-UNK-'] = 1 
 
tag2int = {tag: i + 1 for i, tag in enumerate(list(unique_tags))}
tag2int['-PAD-'] = 0  

In [16]:
word2int['were'], tag2int["B"]

(4130, 1)

In [17]:
#Convert test and trainig dataset to integer
X_train, X_test, Y_train, Y_test = [],[],[],[]
for sent in training_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_train.append(sent_to_int)

for sent in test_sentences:
    sent_to_int = []
    for word in sent:
        try:
            sent_to_int.append(word2int[word.lower()])
        except KeyError:
            sent_to_int.append(word2int['-UNK-'])
    X_test.append(sent_to_int)
    
    
for sent_tag in training_chunk:
    Y_train.append([tag2int[tag] for tag in sent_tag])
    
for sent_tag in test_chunk:
    Y_test.append([tag2int[tag] for tag in sent_tag])

In [18]:
print(X_train[0],"\n",Y_train[0])
print(X_test[0],"\n",Y_test[0])

[12531, 2440, 15620, 11362, 9299, 2696, 3499, 2835, 4215, 5738, 3859, 6825, 16725, 5326, 4621, 14910, 2846, 4, 10537, 14910, 6935, 3322, 4, 12348, 2835, 14841, 13476, 10044, 11211, 5911, 15977, 10451, 3213, 1039, 4820, 9454, 16709] 
 [1, 1, 1, 3, 1, 3, 3, 3, 3, 1, 3, 3, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 2, 1, 3, 3, 1, 3, 3, 1, 1, 3, 3, 1, 3, 3, 2]
[1, 16298, 12660, 1039, 6349, 13757, 3335, 11816, 6234, 13476, 13837, 9557, 1, 10266, 15358, 8389, 2706, 11177, 2835, 12890, 3448, 14345, 14910, 2706, 1039, 15018, 1, 16709] 
 [1, 3, 3, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 3, 1, 1, 3, 1, 3, 1, 3, 1, 1, 1, 3, 3, 2]


In [19]:
#Now we add padding to every sentence for equalling the length to max length of sentences
MAX_LENGTH = len(max(X_train, key=len))
print(MAX_LENGTH) 


78


In [20]:
from keras.preprocessing.sequence import pad_sequences

X_train = pad_sequences(X_train, maxlen=MAX_LENGTH, padding='post')
X_test = pad_sequences(X_test, maxlen=MAX_LENGTH, padding='post')
Y_train = pad_sequences(Y_train, maxlen=MAX_LENGTH, padding='post')
Y_test = pad_sequences(Y_test, maxlen=MAX_LENGTH, padding='post')
 
#print(X_train[0])
#print(X_test[0])
#print(Y_train[0])
#print(Y_test[0])

### Model Buildign using keras

In [21]:
#we will define the accuracy without consideration of padding beacause if not then accuracy will be high due to 
#prediction of padding tags

from keras import backend as K
 
def no_pad_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [22]:
#Now we define the network architecture
#we will use the word embeddins for word to vector
#we have used Bidirectional LSTM, and model return a sequence

#first import library
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import Adam

In [23]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2int), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2int))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy',no_pad_accuracy(0)])
 
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 78, 128)           2209280   
_________________________________________________________________
bidirectional (Bidirectional (None, 78, 512)           788480    
_________________________________________________________________
time_distributed (TimeDistri (None, 78, 4)             2052      
_________________________________________________________________
activation (Activation)      (None, 78, 4)             0         
Total params: 2,999,812
Trainable params: 2,999,812
Non-trainable params: 0
_________________________________________________________________


In [24]:
#Convert to one hot vector

def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)
 

In [25]:
cat_train_tags_y = to_categorical(Y_train, len(tag2int))


In [26]:
# cat_train_tags_y[0]

### Model Training

In [27]:
model.fit(X_train, to_categorical(Y_train, len(tag2int)), batch_size=128, epochs=15, validation_split=0.2)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f03f62c1550>

In [28]:
scores = model.evaluate(X_test, to_categorical(Y_test, len(tag2int)))
print(f"{model.metrics_names[2]}: {scores[2] * 100}")   

ignore_accuracy: 91.22939705848694


### Sample Example try

In [29]:
test_samples = [
    "we will be having a quiz on monday .".split(),
    "I am very big fan of lionel messi's playing style .".split()
]
print(test_samples)
 

 


[['we', 'will', 'be', 'having', 'a', 'quiz', 'on', 'monday', '.'], ['I', 'am', 'very', 'big', 'fan', 'of', 'lionel', "messi's", 'playing', 'style', '.']]


In [30]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2int[w.lower()])
        except KeyError:
            s_int.append(word2int['-UNK-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[ 1237 10548  5828   536 13476 15229 11683  7350 16709     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]
 [10853  3599  1723  3725  8297  1408     1     1  6095 12743 16709     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0]]


In [31]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        no_pad_tokens=[]
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])

        token_sequences.append(token_sequence)
    return token_sequences

In [32]:
predictions = model.predict(test_samples_X)
tag_prediction = (logits_to_tokens(predictions, {i: t for t, i in tag2int.items()}))

In [33]:
#for converting output tag sequence to without "-PAD" tag, if no labels given
def pred_no_pad_without_labels(predictions):
  final_pred = []
  for tag_sent in predictions:
    tag_sent_no_pad = []
    for tags in tag_sent:
      if tags == "-PAD-":
        break
      else:
        tag_sent_no_pad.append(tags)
    final_pred.append(tag_sent_no_pad)
  return final_pred

In [34]:
final_pred = pred_no_pad_without_labels(tag_prediction)
print(final_pred)


[['B', 'B', 'I', 'B', 'B', 'I', 'B', 'B', 'O'], ['B', 'B', 'B', 'I', 'I', 'B', 'B', 'B', 'B', 'I', 'O']]


In [35]:
#for converting output tag sequence to without "-PAD" tag, if labels given
def pred_no_pad_with_labels(predictions,test_tags):
  final_pred = []
  for i in range(len(predictions)):
    k = min(len(test_tags[i]),len(predictions[i]))
    tag_sent_no_pad = []
    for j in range(k):
      tag_sent_no_pad.append(predictions[i][j])
    final_pred.append(tag_sent_no_pad)
  return final_pred

### Prediction on test data

In [36]:
test_pred = model.predict(X_test)
test_tag_pred = logits_to_tokens(test_pred, {i: t for t, i in tag2int.items()})
test_tag_pred_no_pad = pred_no_pad_with_labels(test_tag_pred,test_chunk)

In [37]:
#for counting if the length of test tag sequneces and predicted tag sequences are equall or not
t=0
for i in range(len(test_chunk)):
  for j in range(len(test_tag_pred_no_pad)):
    if i==j:
      if len(test_chunk[i])!=len(test_tag_pred_no_pad[i]):
        print(i)
        t+=1
print(t)



0


In [38]:
d = {"true_labels": test_chunk, "predicted_labels":test_tag_pred_no_pad}
pd_pred = pd.DataFrame(data=d)

In [39]:
pd_pred.head()

Unnamed: 0,true_labels,predicted_labels
0,"[B, I, I, B, I, I, B, B, B, B, I, I, B, B, I, ...","[B, B, I, B, I, I, B, B, B, B, I, I, B, B, I, ..."
1,"[B, B, B, I, B, B, B, B, I, B, I, I, I, B, B, ...","[B, B, B, I, B, B, B, B, I, I, I, I, I, B, B, ..."
2,"[B, B, O, B, B, I, O, B, I, B, I, I, I, O, B, ...","[B, I, O, B, B, I, O, B, I, B, I, I, I, O, B, ..."
3,"[B, B, I, I, O, B, B, O, B, B, I, I, B, B, B, ...","[B, B, I, I, O, B, B, O, B, B, I, I, B, B, B, ..."
4,"[B, O, B, B, B, I, O, B, O, B, B, I, I, I, I, ...","[B, O, B, B, B, I, O, B, O, B, B, I, I, I, I, ..."


In [40]:
#for confusion matrix we make a one list for true labels and predicted labels each, 
test_words = []
y_true=[]
y_pred=[]
for sent in test_chunk:
  for tag in sent:
    y_true.append(tag)

for sent in test_tag_pred_no_pad:
  for tag in sent:
    y_pred.append(tag)

for sent in test_sentences:
  for word in sent:
    test_words.append(word)

In [41]:
d5th = {"words":test_words,"true_label":y_true,"predicted_lable":y_pred}
dataframe_5th = pd.DataFrame(data=d5th)


In [42]:
dataframe_5th.head()

Unnamed: 0,words,true_label,predicted_lable
0,Rockwell,B,B
1,International,I,B
2,Corp.,I,I
3,'s,B,B
4,Tulsa,I,I


In [43]:
from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_true, y_pred, target_names=['-PAD-']+list(unique_tags)))


Classification Report

              precision    recall  f1-score   support

       -PAD-       0.00      0.00      0.00         0
           B       0.91      0.94      0.92     23852
           O       0.91      0.85      0.88     17345
           I       0.93      0.96      0.94      6180

    accuracy                           0.91     47377
   macro avg       0.69      0.69      0.69     47377
weighted avg       0.91      0.91      0.91     47377



  _warn_prf(average, modifier, msg_start, len(result))


In [44]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_true, y_pred,labels=list(unique_tags)+['-PAD-'])
print('Confusion Matrix\n')
print(confusion)


Confusion Matrix

[[22485   102  1263     2]
 [  104  5917   159     0]
 [ 2179   344 14819     3]
 [    0     0     0     0]]


In [45]:
pd_confusion = pd.DataFrame((confusion), columns=list(unique_tags)+['-PAD-'],index=list(unique_tags)+['-PAD-'])

In [46]:
pd_confusion

Unnamed: 0,B,O,I,-PAD-
B,22485,102,1263,2
O,104,5917,159,0
I,2179,344,14819,3
-PAD-,0,0,0,0
