Ref https://github.com/AiswaryaSrinivas/DataScienceWithPython

In [1]:
import nltk, re, pprint
import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
import seaborn as sns
import pprint, time
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from collections import Counter

In [2]:
nltk.download('treebank')
nltk.download('universal_tagset')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
tagged_sentence = nltk.corpus.treebank.tagged_sents(tagset='universal')
tagged_sentence

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], ...]

In [4]:
print("Number of Tagged Sentences", len(tagged_sentence))
tagged_words = [tup for sent in tagged_sentence for tup in sent]
print("Total Number of Tagged Words", len(tagged_words))
vocab = set([word for word, tag in tagged_words])
print("Vocabulary of the Corpus", len(vocab))
tags = set([tag for word, tag in tagged_words])
print("Number of Tags in the Corpus", len(tags))

Number of Tagged Sentences 3914
Total Number of Tagged Words 100676
Vocabulary of the Corpus 12408
Number of Tags in the Corpus 12


In [5]:
train_set, test_set = train_test_split(tagged_sentence, test_size=0.2, random_state=1234)
print("Number of Sentences in Training Data ", len(train_set))
print("Number of Sentences in Testing Data ", len(test_set))

Number of Sentences in Training Data  3131
Number of Sentences in Testing Data  783


In [6]:
def features(sentence, index):
  return {
      'is_first_capital' : int(sentence[index][0].isupper()),
      'is_first_word' : int(index==0),
      'is_last_word' : int(index==len(sentence)-1),
      'is_complete_capital' : int(sentence[index].upper()==sentence[index]),
      'prev_word' : '' if index==0 else sentence[index-1],
      'next_word' : '' if index==len(sentence)-1 else sentence[index+1],
      'is_numeric' : int(sentence[index].isdigit()),
      'is_alphanumeric' : int(bool((re.match("^(?=.*[0-9]$)(?=.*[a-zA-Z])", sentence[index])))),
      'prefix_1' : sentence[index][0],
      'prefix_2' : sentence[index][:2],
      'prefix_3' : sentence[index][:3],
      'prefix_4' : sentence[index][:4],
      'suffix_1' : sentence[index][-1],
      'suffix_2' : sentence[index][-2:],
      'suffix_3' : sentence[index][-3:],
      'suffix_4' : sentence[index][-4:],
      'word_has_hyphen' : 1 if '-' in sentence[index] else 0
  }

In [7]:
def untag(sentence):
    return  [word for word, tag in sentence]

def prepareData(tagged_sentences):
    X, y = [], []
    for sentences in tagged_sentences:
        X.append([features(untag(sentences), index) for index in range(len(sentences))])
        y.append([tag for word, tag in sentences])
    return X, y

In [8]:
X_train, y_train = prepareData(train_set)
X_test, y_test = prepareData(test_set)
X_train[0]

[{'is_first_capital': 1,
  'is_first_word': 1,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': '',
  'next_word': 'Wall',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'O',
  'prefix_2': 'On',
  'prefix_3': 'On',
  'prefix_4': 'On',
  'suffix_1': 'n',
  'suffix_2': 'On',
  'suffix_3': 'On',
  'suffix_4': 'On',
  'word_has_hyphen': 0},
 {'is_first_capital': 1,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': 'On',
  'next_word': 'Street',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'W',
  'prefix_2': 'Wa',
  'prefix_3': 'Wal',
  'prefix_4': 'Wall',
  'suffix_1': 'l',
  'suffix_2': 'll',
  'suffix_3': 'all',
  'suffix_4': 'Wall',
  'word_has_hyphen': 0},
 {'is_first_capital': 1,
  'is_first_word': 0,
  'is_last_word': 0,
  'is_complete_capital': 0,
  'prev_word': 'Wall',
  'next_word': 'men',
  'is_numeric': 0,
  'is_alphanumeric': 0,
  'prefix_1': 'S',
  'prefix_2': 'St',
  'prefix_3': 'Str',
  'prefix_4': 'Str

In [9]:
y_train[0]

['ADP',
 'NOUN',
 'NOUN',
 'NOUN',
 'CONJ',
 'NOUN',
 'VERB',
 'ADP',
 'ADJ',
 'NOUN',
 '.',
 'X',
 'VERB',
 'NUM',
 'DET',
 'ADV',
 'ADV',
 'PRON',
 'VERB',
 'ADP',
 'NOUN',
 'X',
 '.']

In [10]:
crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
predictions = crf.predict(X_test)

In [11]:
metrics.flat_f1_score(y_test, predictions, average='weighted', labels=crf.classes_)

0.9738471726864286

In [12]:
predictions_train = crf.predict(X_train)
metrics.flat_f1_score(y_train, predictions_train, average='weighted', labels=crf.classes_)

0.9963402924209424

Ref https://nlpforhackers.io/lstm-pos-tagger-keras/

In [13]:
import nltk
nltk.download('treebank')
tagged_sentences = nltk.corpus.treebank.tagged_sents()

print(tagged_sentences[0])
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words: ", len(nltk.corpus.treebank.tagged_words()))

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]
Tagged sentences:  3914
Tagged words:  100676


In [14]:
import numpy as np

In [15]:
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

# Let's see how a sequence looks

print(sentences[5])
print(sentence_tags[5])

['Lorillard' 'Inc.' ',' 'the' 'unit' 'of' 'New' 'York-based' 'Loews'
 'Corp.' 'that' '*T*-2' 'makes' 'Kent' 'cigarettes' ',' 'stopped' 'using'
 'crocidolite' 'in' 'its' 'Micronite' 'cigarette' 'filters' 'in' '1956'
 '.']
['NNP' 'NNP' ',' 'DT' 'NN' 'IN' 'JJ' 'JJ' 'NNP' 'NNP' 'WDT' '-NONE-' 'VBZ'
 'NNP' 'NNS' ',' 'VBD' 'VBG' 'NN' 'IN' 'PRP$' 'NN' 'NN' 'NNS' 'IN' 'CD'
 '.']


In [16]:
from sklearn.model_selection import train_test_split

(train_sentences,
test_sentences,
train_tags,
test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

Keras also needs to work with numbers,

In [17]:
words, tags = set([]), set([])

for s in train_sentences:
    for w in s:
        words.add(w.lower())
    
for ts in train_tags:
    for t in ts:
        tags.add(t)

word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0
word2index['-OOV-'] = 1

tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0

In [18]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []

for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    
    train_sentences_X.append(s_int)

for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    
    test_sentences_X.append(s_int)

for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])

for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[969, 2365, 3824, 1989, 4376, 1673, 2784, 6893, 1943, 4376, 2432, 3300, 4244, 2712, 3438, 1032, 7228, 147, 4138, 6805, 9700, 3263, 3300, 1771, 5768, 4673, 6824, 644, 1964, 9837, 5768, 9391, 5888, 6272, 6721, 4140, 228, 858, 9636, 2628, 1976]
[8682, 5171, 8843, 342, 9776, 9411, 2550, 3438, 107, 6272, 4376, 7468, 4760, 1, 9967, 3300, 4376, 1346, 8755, 4123, 1976]
[21, 34, 23, 32, 42, 39, 11, 28, 1, 42, 33, 29, 17, 41, 30, 42, 40, 41, 5, 34, 3, 32, 29, 39, 12, 18, 11, 32, 33, 33, 12, 39, 39, 5, 4, 20, 5, 33, 33, 33, 8]
[33, 33, 33, 24, 17, 28, 32, 30, 20, 5, 42, 4, 39, 28, 32, 29, 42, 39, 46, 20, 8]


In [19]:
MAX_LEGTH = len(max(train_sentences_X, key=len))
print(MAX_LEGTH)

271


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LEGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LEGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LEGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LEGTH, padding='post')

print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[ 969 2365 3824 1989 4376 1673 2784 6893 1943 4376 2432 3300 4244 2712
 3438 1032 7228  147 4138 6805 9700 3263 3300 1771 5768 4673 6824  644
 1964 9837 5768 9391 5888 6272 6721 4140  228  858 9636 2628 1976    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [21]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from keras.optimizers import adam_v2

model = Sequential()
model.add(InputLayer(input_shape=(MAX_LEGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer=adam_v2.Adam(0.001), metrics=['accuracy'])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 271, 128)          1298176   
                                                                 
 bidirectional (Bidirectiona  (None, 271, 512)         788480    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 271, 47)          24111     
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 271, 47)           0         
                                                                 
Total params: 2,110,767
Trainable params: 2,110,767
Non-trainable params: 0
_________________________________________________________________


In [22]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [23]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

In [24]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1828efc2590>

In [25]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]} : {scores[1] == 100}")

accuracy : False
