In [1]:
import tensorflow as tf
# Print the installed TensorFlow version; the saved output of this cell shows 2.5.0.
print(tf.__version__)

2.5.0


In [2]:
# Name of the saved Keras model passed to `load_model`; with the path lines commented out it is
# resolved relative to the current working directory.
# NOTE(review): the commented-out lines appear to be the Google Drive location used on Colab — confirm
# before re-enabling.
# save_path = '/content/drive/MyDrive/Dl projects/RNN basics/parts of speech'
model_name = 'rnn_pos_model'
# model_path = save_path + '/' + model_name

from sklearn.metrics import confusion_matrix
import numpy as np

def balanced_accuracy(y_true, y_pred):
    """Return the mean per-class recall over all non-padding tokens.

    Parameters
    ----------
    y_true : object exposing ``.numpy()``
        Ground-truth tag ids. Id 0 is treated as padding and is excluded.
        NOTE(review): assumed to hold one integer id per position of
        ``y_pred`` (sparse labels, not one-hot) — confirm against the
        training code.
    y_pred : object exposing ``.numpy()``
        Per-class scores; the last axis is reduced with ``argmax`` to get the
        predicted tag id.

    Returns
    -------
    float
        Average of the recalls of every class that occurs in the non-padding
        part of ``y_true``; ``0.0`` when there are no non-padding tokens.

    Notes
    -----
    Both arguments must support ``.numpy()``, so the function only works on
    eager tensors.
    """
    true_ids = np.asarray(y_true.numpy()).ravel()
    pred_ids = np.argmax(y_pred.numpy(), axis=-1).ravel()

    # Drop every padded position (id 0), not only the leading ones: once a
    # batch is flattened, padding of later sequences sits in the middle of the
    # array. Padding would otherwise inflate the score.
    # Tokens for punctuation such as ',' or '!' could be masked out the same
    # way, since their tags are not informative.
    keep = true_ids != 0
    true_ids = true_ids[keep]
    pred_ids = pred_ids[keep]

    # Only classes that actually occur in the ground truth have a defined
    # recall; classes that appear solely in the predictions are ignored.
    classes = np.unique(true_ids)
    if classes.size == 0:
        return 0.0

    recalls = [np.mean(pred_ids[true_ids == cls] == cls) for cls in classes]
    return float(np.mean(recalls))

  
# Load the saved tagger. `custom_objects` maps the name 'balanced_accuracy' to the function
# defined above so Keras can resolve that name while deserializing the model.
# `model` is a module-level global that `predict` reads directly.
model = tf.keras.models.load_model(model_name , custom_objects = {'balanced_accuracy' : balanced_accuracy})

In [3]:
# import os
# os.chdir(save_path)
# print(os.getcwd())

import joblib
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load these files from a
# trusted source.
# tags_dict maps integer tag ids to tag names (printed below; id 0 is 'padding').
tags_dict = joblib.load('tags.joblib')
# word_tokenizer is used later through `texts_to_sequences` and `word_index`.
word_tokenizer = joblib.load('tokenizer.joblib')

print('\n')
print(tags_dict)



{1: 'noun', 2: 'verb', 3: '.', 4: 'adp', 5: 'det', 6: 'adj', 7: 'adv', 8: 'pron', 9: 'conj', 10: 'prt', 11: 'num', 12: 'x', 0: 'padding'}


In [4]:
import re

def text_preprocess(text):
    """Replace each run of characters other than ASCII letters/digits with one space, then trim both ends."""
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', text)
    return collapsed.strip()

def predict(text, max_len, tags_dict, word_tokenizer):
    """Tag every word of ``text`` with a part of speech.

    Uses the module-level ``model`` and ``text_preprocess``.

    Parameters
    ----------
    text : str
        Raw input sentence.
    max_len : int
        Length the token sequence is pre-padded (with 0) or post-truncated to
        before it is passed to the model.
    tags_dict : dict
        Maps predicted integer tag ids to tag names.
    word_tokenizer : tokenizer
        Object providing ``texts_to_sequences`` and ``word_index``.

    Returns
    -------
    tuple
        ``(original_text, output)`` where ``output`` is a list of
        ``[word, tag_name, probability]`` rows, one per tagged word. ``output``
        is empty when the text contains no letters or digits.
    """
    original_text = text

    # `split()` (rather than `split(' ')`) yields no empty strings, so empty or
    # punctuation-only input produces no words at all.
    words = text_preprocess(text).split()
    if not words:
        return original_text, []

    # One tokenizer call per word list keeps words and ids aligned. Words for
    # which the tokenizer returns no id are dropped from both lists.
    # NOTE(review): assumes each word maps to a single id (word-level
    # tokenizer); only the first id is used otherwise — confirm.
    sequences = word_tokenizer.texts_to_sequences(words)
    pairs = [(word, seq[0]) for word, seq in zip(words, sequences) if len(seq) > 0]
    # Mirror truncating='post' below: only the first `max_len` words are kept.
    pairs = pairs[:max_len]
    if not pairs:
        return original_text, []

    kept_words = [word for word, _ in pairs]
    token_ids = [token_id for _, token_id in pairs]

    padded = tf.keras.preprocessing.sequence.pad_sequences(
        [token_ids], maxlen = max_len, padding = 'pre',
        truncating='post', value=0.0 )

    # The real tokens occupy the last len(token_ids) positions of the padded
    # input. Slice the model output by that count instead of trimming leading
    # zeros from the *predictions*, which misaligns words and tags whenever the
    # model predicts a non-zero tag on padding or tag 0 on a real word.
    scores = model.predict(padded)[0][-len(token_ids):]
    tag_ids = np.argmax(scores, axis=-1)
    tag_probs = np.max(scores, axis=-1)

    index_to_word = {index: word for word, index in word_tokenizer.word_index.items()}

    output = []
    for word, token_id, tag_id, prob in zip(kept_words, token_ids, tag_ids, tag_probs):
        vocab_word = index_to_word[token_id]
        # Out-of-vocabulary words are shown as typed rather than as '<OOV>'.
        shown = word if vocab_word == '<OOV>' else vocab_word
        output.append([shown, tags_dict[int(tag_id)], prob])

    return original_text, output

In [5]:
# Length every input sequence is padded/truncated to inside `predict`.
# NOTE(review): presumably has to match the sequence length used in training — confirm.
max_len = 125
text = 'my favourite movie is spiderman'

# `my_text` is the unmodified input; `output` holds one [word, tag, probability] row per word.
my_text, output = predict(text, max_len,tags_dict, word_tokenizer)

In [6]:
my_text

'my favourite movie is spiderman'

In [7]:
output

[['my', 'det', 0.81986964],
 ['favourite', 'noun', 0.8906837],
 ['movie', 'noun', 0.9999763],
 ['is', 'verb', 0.9998092],
 ['spiderman', 'noun', 0.64835703]]

In [8]:
from pywebio.input import input,TEXT
from pywebio.output import put_tabs,put_table

In [9]:
def predict_output(max_len=125):
    """Prompt for a sentence through PyWebIO and display its part-of-speech tags.

    Parameters
    ----------
    max_len : int, optional
        Sequence length forwarded to ``predict``. Defaults to 125, the value
        that was previously hard-coded here.

    Notes
    -----
    ``input`` is ``pywebio.input.input`` (imported in this notebook), not the
    Python builtin. The function reads the module-level ``tags_dict`` and
    ``word_tokenizer`` and returns nothing; the result is shown in two tabs,
    one with the tagging table and one with the original text.
    """
    text = input('Parts Of Speech Tagger',placeholder = "Enter the text", type = TEXT)

    text, output = predict(text, max_len, tags_dict, word_tokenizer)

    results_tab = {'title': 'Results', 'content': [
        put_table(output, header=['Words', 'Tag', 'Probability']) ]}
    input_tab = {'title': 'Input text', 'content': text}
    put_tabs([results_tab, input_tab])

In [10]:
predict_output()