## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

import keras
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from keras.models import Model,Sequential
from keras.layers import Dense,SimpleRNN,Embedding,LSTM,GRU,Bidirectional,TimeDistributed,InputLayer
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import brown,treebank

## Creating Dataset

In [2]:
treebank.tagged_sents(tagset="universal")[0]

[('Pierre', 'NOUN'),
 ('Vinken', 'NOUN'),
 (',', '.'),
 ('61', 'NUM'),
 ('years', 'NOUN'),
 ('old', 'ADJ'),
 (',', '.'),
 ('will', 'VERB'),
 ('join', 'VERB'),
 ('the', 'DET'),
 ('board', 'NOUN'),
 ('as', 'ADP'),
 ('a', 'DET'),
 ('nonexecutive', 'ADJ'),
 ('director', 'NOUN'),
 ('Nov.', 'NOUN'),
 ('29', 'NUM'),
 ('.', '.')]

In [3]:
d1 = treebank.tagged_sents(tagset="universal")

In [4]:
brown.tagged_sents(tagset="universal")[0]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN'),
 ('said', 'VERB'),
 ('Friday', 'NOUN'),
 ('an', 'DET'),
 ('investigation', 'NOUN'),
 ('of', 'ADP'),
 ("Atlanta's", 'NOUN'),
 ('recent', 'ADJ'),
 ('primary', 'NOUN'),
 ('election', 'NOUN'),
 ('produced', 'VERB'),
 ('``', '.'),
 ('no', 'DET'),
 ('evidence', 'NOUN'),
 ("''", '.'),
 ('that', 'ADP'),
 ('any', 'DET'),
 ('irregularities', 'NOUN'),
 ('took', 'VERB'),
 ('place', 'NOUN'),
 ('.', '.')]

In [5]:
d2 = brown.tagged_sents(tagset="universal")

In [6]:
dataset = d1+d2

In [7]:
dataset

[[('Pierre', 'NOUN'), ('Vinken', 'NOUN'), (',', '.'), ('61', 'NUM'), ('years', 'NOUN'), ('old', 'ADJ'), (',', '.'), ('will', 'VERB'), ('join', 'VERB'), ('the', 'DET'), ('board', 'NOUN'), ('as', 'ADP'), ('a', 'DET'), ('nonexecutive', 'ADJ'), ('director', 'NOUN'), ('Nov.', 'NOUN'), ('29', 'NUM'), ('.', '.')], [('Mr.', 'NOUN'), ('Vinken', 'NOUN'), ('is', 'VERB'), ('chairman', 'NOUN'), ('of', 'ADP'), ('Elsevier', 'NOUN'), ('N.V.', 'NOUN'), (',', '.'), ('the', 'DET'), ('Dutch', 'NOUN'), ('publishing', 'VERB'), ('group', 'NOUN'), ('.', '.')], ...]

In [8]:
len(dataset)

61254

In [9]:
x = []
y = []
for l in dataset:
    x1 = []
    y1 = []
    for t in l:
        x1.append(t[0])
        y1.append(t[1])
        
    x.append(x1)
    y.append(y1)

In [10]:
len(x)

61254

In [11]:
len(y)

61254

In [12]:
x #Input Variable

[['Pierre',
  'Vinken',
  ',',
  '61',
  'years',
  'old',
  ',',
  'will',
  'join',
  'the',
  'board',
  'as',
  'a',
  'nonexecutive',
  'director',
  'Nov.',
  '29',
  '.'],
 ['Mr.',
  'Vinken',
  'is',
  'chairman',
  'of',
  'Elsevier',
  'N.V.',
  ',',
  'the',
  'Dutch',
  'publishing',
  'group',
  '.'],
 ['Rudolph',
  'Agnew',
  ',',
  '55',
  'years',
  'old',
  'and',
  'former',
  'chairman',
  'of',
  'Consolidated',
  'Gold',
  'Fields',
  'PLC',
  ',',
  'was',
  'named',
  '*-1',
  'a',
  'nonexecutive',
  'director',
  'of',
  'this',
  'British',
  'industrial',
  'conglomerate',
  '.'],
 ['A',
  'form',
  'of',
  'asbestos',
  'once',
  'used',
  '*',
  '*',
  'to',
  'make',
  'Kent',
  'cigarette',
  'filters',
  'has',
  'caused',
  'a',
  'high',
  'percentage',
  'of',
  'cancer',
  'deaths',
  'among',
  'a',
  'group',
  'of',
  'workers',
  'exposed',
  '*',
  'to',
  'it',
  'more',
  'than',
  '30',
  'years',
  'ago',
  ',',
  'researchers',
  'reported'

In [13]:
y #Output Variable

[['NOUN',
  'NOUN',
  '.',
  'NUM',
  'NOUN',
  'ADJ',
  '.',
  'VERB',
  'VERB',
  'DET',
  'NOUN',
  'ADP',
  'DET',
  'ADJ',
  'NOUN',
  'NOUN',
  'NUM',
  '.'],
 ['NOUN',
  'NOUN',
  'VERB',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  '.',
  'DET',
  'NOUN',
  'VERB',
  'NOUN',
  '.'],
 ['NOUN',
  'NOUN',
  '.',
  'NUM',
  'NOUN',
  'ADJ',
  'CONJ',
  'ADJ',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  'NOUN',
  'NOUN',
  '.',
  'VERB',
  'VERB',
  'X',
  'DET',
  'ADJ',
  'NOUN',
  'ADP',
  'DET',
  'ADJ',
  'ADJ',
  'NOUN',
  '.'],
 ['DET',
  'NOUN',
  'ADP',
  'NOUN',
  'ADV',
  'VERB',
  'X',
  'X',
  'PRT',
  'VERB',
  'NOUN',
  'NOUN',
  'NOUN',
  'VERB',
  'VERB',
  'DET',
  'ADJ',
  'NOUN',
  'ADP',
  'NOUN',
  'NOUN',
  'ADP',
  'DET',
  'NOUN',
  'ADP',
  'NOUN',
  'VERB',
  'X',
  'PRT',
  'PRON',
  'ADV',
  'ADP',
  'NUM',
  'NOUN',
  'ADP',
  '.',
  'NOUN',
  'VERB',
  'X',
  'X',
  '.'],
 ['DET',
  'NOUN',
  'NOUN',
  '.',
  'NOUN',
  '.',
  'VERB',
  'ADV',
  'ADJ',
  'ADP'

## Checking Both xi and yi are in same length or not

In [14]:
c=0
for i in range(len(x)):
    if len(x[i])==len(y[i]):
        pass
    else:
        c+=1
c

0

## Finding vocabulary Size

In [15]:
voc = []
for q in x:
    for word in q:
        if word.lower() not in voc:
            voc.append(word.lower())

In [16]:
len(voc) #No of unique words in your courpus

53232

## Preprocessing and Tokenization

In [17]:
tk_x = Tokenizer(oov_token='UNK')
tk_x.fit_on_texts(x)

In [18]:
len(tk_x.word_index)

53233

In [19]:
tk_y = Tokenizer(oov_token='OOV')
tk_y.fit_on_texts(y)

In [20]:
len(tk_y.word_index)

13

In [21]:
tk_y.index_word

{1: 'OOV',
 2: 'noun',
 3: 'verb',
 4: '.',
 5: 'adp',
 6: 'det',
 7: 'adj',
 8: 'adv',
 9: 'pron',
 10: 'conj',
 11: 'prt',
 12: 'num',
 13: 'x'}

In [22]:
x_d = tk_x.texts_to_sequences(x)
y_d = tk_y.texts_to_sequences(y)

In [23]:
len(x_d)

61254

## Padding

In [24]:
final_x_d = pad_sequences(x_d,padding='post')
final_y_d = pad_sequences(y_d,padding='post')

In [25]:
final_x_d.shape

(61254, 271)

In [26]:
final_y_d.shape

(61254, 271)

In [27]:
final_y_d

array([[2, 2, 4, ..., 0, 0, 0],
       [2, 2, 3, ..., 0, 0, 0],
       [2, 2, 4, ..., 0, 0, 0],
       ...,
       [6, 2, 5, ..., 0, 0, 0],
       [9, 3, 6, ..., 0, 0, 0],
       [5, 6, 9, ..., 0, 0, 0]])

In [28]:
final_y = to_categorical(final_y_d)

In [29]:
final_y.shape

(61254, 271, 14)

## Model Building

In [30]:
model = Sequential()
model.add(InputLayer(input_shape=(271,)))
model.add(Embedding(53233+1,5))
model.add(Bidirectional(SimpleRNN(units=50,return_sequences=True)))
model.add(TimeDistributed(Dense(units=14,activation='softmax')))



In [31]:
model.summary()

In [32]:
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=["accuracy"])

In [33]:
model.fit(final_x_d,final_y,epochs=5,batch_size=120,validation_split=0.2)

Epoch 1/5
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m272s[0m 633ms/step - accuracy: 0.9134 - loss: 0.3610 - val_accuracy: 0.9830 - val_loss: 0.0595
Epoch 2/5
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 579ms/step - accuracy: 0.9814 - loss: 0.0654 - val_accuracy: 0.9924 - val_loss: 0.0269
Epoch 3/5
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 582ms/step - accuracy: 0.9913 - loss: 0.0324 - val_accuracy: 0.9957 - val_loss: 0.0156
Epoch 4/5
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 580ms/step - accuracy: 0.9951 - loss: 0.0181 - val_accuracy: 0.9967 - val_loss: 0.0117
Epoch 5/5
[1m409/409[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 581ms/step - accuracy: 0.9966 - loss: 0.0125 - val_accuracy: 0.9972 - val_loss: 0.0097


<keras.src.callbacks.history.History at 0x1bd6f7b2f40>

In [34]:
te = ["dhoni loves playing cricket and he is from ranchi"]

In [35]:
tk_x.texts_to_sequences(te)

[[1, 5515, 1117, 20352, 6, 14, 11, 30, 1]]

In [36]:
tk_x.sequences_to_texts(tk_x.texts_to_sequences(te))

['UNK loves playing cricket and he is from UNK']

In [37]:
test = pad_sequences(tk_x.texts_to_sequences(te),maxlen=271,padding='post')

In [38]:
np.argmax(model.predict(test)[0],axis=1)[np.argmax(model.predict(test)[0],axis=1)!=0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


array([ 2,  3,  3,  2, 10,  9,  3,  5,  7], dtype=int64)

In [39]:
tk_y.sequences_to_texts([[7, 2, 9, 3]])

['adj noun pron verb']

In [40]:
pickle.dump(tk_x,open(r"D:\streamlit\Deep Learning\POS Tag\tk_x.pkl",'wb'))

In [41]:
pickle.dump(tk_y,open(r"D:\streamlit\Deep Learning\POS Tag\tk_y.pkl",'wb'))

In [42]:
pickle.dump(model,open(r"D:\streamlit\Deep Learning\POS Tag\model.pkl",'wb'))

In [43]:
model.save(r"D:\streamlit\Deep Learning\POS Tag\model1.h5")

