<a href="https://colab.research.google.com/github/Mizanur4E/Text-normalizer-formatter-CobaltSpeech/blob/main/Demo_CoW_21_3_Statistical_Formatter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the later cells read their files
# from paths under '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Load IOB tagger model and Tokenizers

Importing `word2idx` and `tag2idx`

In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserialising.
# Only load these files if you trust whoever produced them.

# word -> integer index lookup used to encode the input words.
with open('/content/drive/My Drive/word2idx.pkl', 'rb') as word2idx_file:
  word2idx = pickle.load(word2idx_file)

# tag -> integer index lookup for the tagger's output classes.
with open('/content/drive/My Drive/tag2idx.pkl', 'rb') as tag2idx_file:
  tag2idx = pickle.load(tag2idx_file)

# Inverse lookup (integer index -> tag) used to decode the predictions.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}

Before loading the weights, define the model

In [None]:
import tensorflow as tf
import numpy as np
import random as python_random
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import model_from_json

In [None]:
# Fixed sequence length fed to the network (assumed to be 200); shorter
# inputs are padded up to this length before prediction.
max_len = 200
# Size of the embedding table: one row per entry of word2idx.
num_words = len(word2idx)
# Number of output classes: one per entry of tag2idx.
num_tags = len(tag2idx)

# Embedding -> spatial dropout -> bidirectional LSTM -> per-timestep softmax.
input_word = Input(shape=(max_len,))
embedded = Embedding(input_dim=num_words, output_dim=50, input_length=max_len)(input_word)
regularised = SpatialDropout1D(0.1)(embedded)
encoded = Bidirectional(
    LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)
)(regularised)
out = TimeDistributed(Dense(num_tags, activation="softmax"))(encoded)

model = Model(input_word, out)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 50)           4301350   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 50)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 200)          120800    
_________________________________________________________________
time_distributed (TimeDistri (None, 200, 3)            603       
Total params: 4,422,753
Trainable params: 4,422,753
Non-trainable params: 0
_________________________________________________________________


Loading the weights into the model

In [None]:
# Load the saved weights from Drive into the `model` built in the cell above.
best_model_path="/content/drive/My Drive/weights.best.hdf5"
model.load_weights(best_model_path)

### Load the formatter model and set it up for inference

In [None]:
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy



# Filled on the first call to formatter(); re-running this cell clears it.
_FORMATTER_ASSETS = {}


def _load_formatter_assets():
  """Return (tokenizer1, tokenizer2, gen), loading them from Drive only once."""
  if not _FORMATTER_ASSETS:
    with open('/content/drive/My Drive/tokenizer1V2.txt') as f:
      lines1 = f.read()

    with open('/content/drive/My Drive/tokenizer2V2.txt') as f:
      lines2 = f.read()

    _FORMATTER_ASSETS['tokenizer1'] = tokenizer_from_json(lines1)
    _FORMATTER_ASSETS['tokenizer2'] = tokenizer_from_json(lines2)
    _FORMATTER_ASSETS['gen'] = tf.keras.models.load_model(
        '/content/drive/My Drive/modelYX_V2')

  return (
      _FORMATTER_ASSETS['tokenizer1'],
      _FORMATTER_ASSETS['tokenizer2'],
      _FORMATTER_ASSETS['gen'],
  )


def formatter(X):
  """Convert a run of spoken-form words into formatted text.

  Args:
    X: iterable of word strings making up one span to be formatted.

  Returns:
    The list produced by ``tokenizer1.sequences_to_texts`` for the single
    input sequence, i.e. a one-element list holding the formatted text.

  The tokenizers and the saved model are loaded on the first call and reused
  afterwards; previously they were reloaded from Drive on every call.
  """
  tokenizer1, tokenizer2, gen = _load_formatter_assets()

  # Join the words with a single space each (trailing space kept, as before)
  # and wrap in a list because the tokenizer works on batches of texts.
  speech = [''.join(word + ' ' for word in X)]

  # Sequence length passed to the model; presumably the length it was
  # trained with -- TODO confirm.
  maxlen2 = 9

  # tokenizer2 encodes the input text; tokenizer1 decodes the model output.
  inp = tokenizer2.texts_to_sequences(speech)
  inp = pad_sequences(inp, padding='post', maxlen=maxlen2)
  out = gen.predict(inp)
  out = numpy.argmax(out, axis=-1)
  text_out = tokenizer1.sequences_to_texts(out)

  return text_out

### Pipeline to connect two models for single line conversion

In [None]:
def _split_by_iob(words, tags):
  """Group words into pass-through words and runs that need formatting.

  Returns a list whose items are either a string (a word tagged 'O') or a
  list of strings (consecutive non-'O' words; a 'B' tag inside an open run
  closes it and starts a new one).
  """
  segments = []
  span = []
  for word, tag in zip(words, tags):
    if tag == 'O':
      if span:
        segments.append(span)
        span = []
      segments.append(word)
    elif tag == 'B' and span:
      segments.append(span)
      span = [word]
    else:
      # Either the first word of a new run or a continuation of the open one.
      span.append(word)
  if span:
    # Flush a run that reaches the end of the sentence.
    segments.append(span)
  return segments


def line_formatter(s):
  """Tag one spoken-form line, format the tagged spans, and print the result.

  Args:
    s: a single line of text; words are separated by single spaces.

  Returns:
    An empty list when `s` is blank (nothing is printed); otherwise None,
    after printing the spoken form, the first 28 predicted tags, and the
    formatted form.

  Changes from the earlier version:
    * Blank input is skipped explicitly. The old ``seq == [None]`` guard
      could never match because unknown words map to 0, not None.
    * Only the tags of real words are used. The old loop also read the tag
      predicted for the first padding position and raised IndexError when
      that tag was not 'O'.
    * Lines longer than `max_len` words no longer raise IndexError: the
      first `max_len` words are tagged and the rest are treated as 'O'.
  """
  if not s.strip():
    return []

  s = s.upper()      # upper-case to match the dictionary keys
  s = s.split(' ')   # list of words

  # Encode each word with word2idx; unknown words map to index 0.
  seq = [word2idx.get(word, 0) for word in s]

  # truncating='post' keeps the leading words so tags stay aligned with s.
  padded = pad_sequences(sequences=[seq], maxlen=max_len, padding='post',
                         truncating='post', value=num_words-1)
  p = model.predict(np.array(padded))
  y_pred = np.argmax(p, axis=-1)

  y_tag = [idx2tag.get(i) for sen in y_pred for i in sen]

  # One tag per real word; words beyond the model's window count as 'O'.
  word_tags = y_tag[:len(s)]
  word_tags += ['O'] * (len(s) - len(word_tags))

  # Pass 'O' words through unchanged and run each tagged span through the
  # formatter, keeping the first (only) text it returns.
  formatted_line = [
      formatter(segment)[0] if isinstance(segment, list) else segment
      for segment in _split_by_iob(s, word_tags)
  ]

  s = ' '.join(s).strip()

  print("{:20}".format("Spoken Form:"),s)
  print("{:20}".format("IOB tagger Output:"),y_tag[:28])

  formatted_line = ' '.join(formatted_line).strip()

  print("{:20}".format("Formatted Form:"),formatted_line)
  print('\n')


### Read an input text file and convert it line by line

In [None]:
import random
import re

# Read every line of the demo file up front.
with open('/content/drive/My Drive/demo_sample.txt','r') as f:
  lines = f.readlines()

# Optional: call random.shuffle(lines) here to process the lines in random order.
for line in lines:
  # Trim surrounding whitespace, then drop any literal 'None' text.
  line = re.sub('None','',line.strip())
  line_formatter(line)

Spoken Form:         FOR THEM I'VE ADJUSTED WELL OVER TWO THOUSAND CLAIMS FOR THEM AND I KNOW YOU'VE SAID THAT
IOB tagger Output:   ['O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Formatted Form:      FOR THEM I'VE ADJUSTED WELL OVER 2000 CLAIMS FOR THEM AND I KNOW YOU'VE SAID THAT


Spoken Form:         JANUARY TWENTY FIRST TWO THOUSAND TWENTY CORRECT THAT'S WHEN I RECEIVED THE DOCUMENTS
IOB tagger Output:   ['B', 'I', 'I', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Formatted Form:      january 21 2002 CORRECT THAT'S WHEN I RECEIVED THE DOCUMENTS


Spoken Form:         CAN YOU TELL ME WHAT ROOM THAT IS ONE HUNDRED FOUR LOOKS LIKE MY SON'S BEDROOM
IOB tagger Output:   ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Formatt