# Import required libraries

In [43]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM, GRU
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import time
from tqdm import *
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

# Loading data from training file

In [44]:
# Read the first 1,000,000 rows of the training file. The full frame is kept as
# `check_results` because later cells look sentences up by `sentence_id`.
check_results = pd.read_csv('./en_train.csv', nrows=1000000)
# Narrow view holding only the columns used for modelling.
data = check_results[['class', 'before', 'after']]
# Model input is the raw token text, the target is its semiotic class.
x_train, y_train = data['before'], data['class']

# Converting labels to One hot encoding

In [45]:
# Learn the class-name -> integer mapping and apply it in one step; the fitted
# encoder is kept so predictions can be mapped back to class names later.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y_train)
# Expand the integer ids into one-hot rows (one column per class).
labels_new = np_utils.to_categorical(encoded_Y)

# Converting values from training data to strings

In [46]:
# Turn every raw token into a document of its own: a one-element list holding
# the token's string form (str() also covers non-string cells such as NaN).
# Iterating the Series walks it positionally, which avoids a label-based
# `x_train[i]` lookup per element (slow, and only correct while the index is
# the default 0..n-1 range).
n = len(x_train)
inp_data = [[str(token)] for token in tqdm(x_train, total=n)]
docs = inp_data

100%|██████████| 1000000/1000000 [00:23<00:00, 43111.46it/s]


# Tokenization and padding

In [47]:
# Every document is padded (at the end) to this many ids.
max_length = 2

# Fit the tokenizer on the one-token documents and size the vocabulary.
# The +1 presumably leaves room for index 0, which the tokenizer does not
# assign to any word -- confirm against the Keras Tokenizer documentation.
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = 1 + len(t.word_index)

# Replace each document by its list of integer ids, then pad to max_length.
encoded_docs = t.texts_to_sequences(docs)
padded_docs = pad_sequences(encoded_docs, padding='post', maxlen=max_length)


# Loading pre-trained GloVe word vectors into a dictionary

In [48]:
# Load the whole GloVe file into memory as {word: 100-d float32 vector}.
# Each line is "<word> <v1> <v2> ... <v100>" separated by whitespace.
embeddings_index = dict()
# `with` guarantees the handle is closed even if parsing fails; the explicit
# encoding keeps the read independent of the platform's default codec.
with open('./glove.6B/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


# Load vector for each word as embedding matrix

In [49]:
# Build the weight matrix for the Embedding layer: row i holds the vector of
# the word whose tokenizer index is i (row 0 stays all zeros).
embedding_dim = 100  # width of the GloVe vectors loaded above
count = 0            # number of vocabulary words without a GloVe vector
x_data = []          # not used in this cell; kept for compatibility
wd = []              # every vocabulary word, in tokenizer order
x_row = np.zeros(embedding_dim, dtype=float)
embedding_matrix = zeros((vocab_size, embedding_dim))

for word, i in t.word_index.items():
    wd.append(word)
    # Look the word up in the pre-trained vectors (None when it is unknown).
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Fallback for unknown words: the code points of the first
        # `embedding_dim` characters, zero padded. A fresh buffer is created
        # per word so that characters of an earlier, longer word cannot leak
        # into the row of a later, shorter one.
        x_row = np.zeros(embedding_dim, dtype=float)
        for j, xi in enumerate(str(word)[:embedding_dim]):
            x_row[j] = ord(xi)
        embedding_matrix[i] = x_row
        count += 1

In [243]:
# Prints the number of rows in the embedding matrix (one per vocabulary index).
# This is not the number of missing vectors; that tally is kept in `count`.
print(len(embedding_matrix)) # author's note: for 34,888 words no pre-trained vector was found

97713


In [62]:
# Spot check: display the embedding row stored for vocabulary index 11.
embedding_matrix[11]

array([-1.89700007e-01,  5.00239991e-02,  1.90840006e-01, -4.91839983e-02,
       -8.97369981e-02,  2.10060000e-01, -5.49520016e-01,  9.83769968e-02,
       -2.01350003e-01,  3.42409998e-01, -9.26769972e-02,  1.60999998e-01,
       -1.32679999e-01, -2.81599998e-01,  1.87370002e-01, -4.29589987e-01,
        9.60389972e-01,  1.39719993e-01, -1.07809997e+00,  4.05180007e-01,
        5.05389988e-01, -5.50639987e-01,  4.84400004e-01,  3.80439997e-01,
       -2.90549989e-03, -3.49420011e-01, -9.96960029e-02, -7.83680022e-01,
        1.03629994e+00, -2.31399998e-01, -4.71210003e-01,  5.71259975e-01,
       -2.14540005e-01,  3.59580010e-01, -4.83190000e-01,  1.08749998e+00,
        2.85239995e-01,  1.24470003e-01, -3.92480008e-02, -7.67320022e-02,
       -7.63429999e-01, -3.24090004e-01, -5.74899971e-01, -1.08930004e+00,
       -4.18110013e-01,  4.51200008e-01,  1.21119998e-01, -5.13670027e-01,
       -1.33489996e-01, -1.13779998e+00, -2.87680000e-01,  1.67740002e-01,
        5.58040023e-01,  

# Create a sequential model for embedding

In [20]:
# Define the classifier: embedding -> GRU -> softmax over the classes.
model = Sequential()
# The embedding starts from the prepared weight matrix and stays trainable.
# Its width and input length are read from the data prepared above rather
# than repeated as literals, so they cannot drift apart.
e = Embedding(vocab_size, embedding_matrix.shape[1], weights=[embedding_matrix],
              input_length=max_length, trainable=True)
model.add(e)
# Single recurrent layer; dropout=0.2 is applied to its inputs.
model.add(GRU(64, dropout=0.2))
# One output unit per one-hot label column.
model.add(Dense(labels_new.shape[1], activation='softmax'))

In [21]:
from keras.callbacks import EarlyStopping

# Watch validation accuracy with a patience of 5 epochs and a minimum
# improvement of 0.001; verbose=1 reports the epoch at which training stops.
earlystop = EarlyStopping(
    monitor='val_acc',
    min_delta=0.001,
    patience=5,
    verbose=1,
    mode='auto',
)
callbacks_list = [earlystop]

# Training Model and predicting accuracy

In [24]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

# summarize the model; summary() prints the table itself and returns None, so
# it must not be wrapped in print() (that only adds a stray "None" line)
model.summary()

# The first 900,000 rows are used for fitting; later rows are held out.
training_data = padded_docs[0:900000]
training_labels = labels_new[0:900000]

# fit the model; validation_split=0.2 sets aside 20% of the rows passed here,
# and that validation accuracy is what the early-stopping callback monitors
model.fit(training_data, training_labels, batch_size=128, verbose=1, epochs=50,
          callbacks=callbacks_list, validation_split=0.2)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 2, 100)            9771300   
_________________________________________________________________
gru_1 (GRU)                  (None, 64)                31680     
_________________________________________________________________
dense_3 (Dense)              (None, 16)                1040      
Total params: 9,804,020
Trainable params: 9,804,020
Non-trainable params: 0
_________________________________________________________________
None
Train on 720000 samples, validate on 180000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 00006: early stopping


<keras.callbacks.History at 0x7f03517e6a58>

# Testing model accuracy

In [25]:
# Score the model on the held-out rows (everything after the first 900,000).
testing_data = padded_docs[900000:]
testing_labels = labels_new[900000:]

loss, accuracy = model.evaluate(testing_data, testing_labels, verbose=1)
accuracy_percent = accuracy * 100
print('Accuracy: %f' % accuracy_percent)

Accuracy: 97.812000


# Training using LSTM as the first layer

In [27]:
from keras.callbacks import EarlyStopping

# Second experiment: an LSTM reads the padded integer ids directly.
training_data = padded_docs[0:900000]
training_labels = labels_new[0:900000]
testing_data = padded_docs[900000:len(x_train)]
testing_labels = labels_new[900000:len(y_train)]

# Reshape to (samples, 1, ids-per-token): one timestep whose features are the
# padded ids. -1 lets numpy infer the sample count, so the cell keeps working
# if the number of loaded rows changes.
training_data = np.reshape(training_data, (-1, 1, training_data.shape[1]))
testing_data = np.reshape(testing_data, (-1, 1, testing_data.shape[1]))

model_new = Sequential()
model_new.add(LSTM(100, dropout=0.0, recurrent_dropout=0.0,
                   input_shape=(None, training_data.shape[2])))
# No Embedding layer is created here: this network consumes the raw ids, and
# `model` (the GRU network trained above) must not be modified by this cell.
model_new.add(Dense(32))
# One output unit per one-hot label column.
model_new.add(Dense(training_labels.shape[1], activation='softmax'))
# NOTE(review): mean squared error is an unusual loss for a softmax output; the
# first model uses categorical_crossentropy. Left unchanged so the recorded
# accuracy stays comparable -- confirm whether this was intentional.
model_new.compile(loss='mean_squared_error', optimizer='adam', metrics=['acc'])
# NOTE(review): the held-out rows are also passed as validation data, so early
# stopping is driven by the same rows that are later reported as test accuracy.
monitor = EarlyStopping(monitor='val_acc', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model_new.fit(training_data, training_labels,
              validation_data=(testing_data, testing_labels),
              callbacks=[monitor], verbose=1, epochs=10)

Train on 900000 samples, validate on 100000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00007: early stopping


<keras.callbacks.History at 0x7f0349b02048>

# Testing model accuracy

In [28]:
loss, accuracy = model_new.evaluate(testing_data, testing_labels, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.036000


# Save Model

In [83]:
# Saves `model` to the HDF5 file 'model_LSTM_embeddings.h5'.
# NOTE(review): in this notebook `model` is the GRU network defined earlier,
# while the file name says LSTM -- confirm which model is meant to be saved.
model.save('model_LSTM_embeddings.h5')

In [11]:
from keras.models import load_model
# Restore the network stored in 'model_LSTM_embeddings.h5'; this rebinds
# `model`, so later cells predict with the loaded network.
model = load_model('model_LSTM_embeddings.h5')
# summary() prints the table itself and returns None, so it is called directly
# instead of being wrapped in print() (which only adds a stray "None" line).
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 2, 100)            9771300   
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1616      
Total params: 9,853,316
Trainable params: 9,853,316
Non-trainable params: 0
_________________________________________________________________
None


# User input for sentence id

In [83]:
# Offer the demo sentence ids, then read the user's choice.
print("Please Enter one of the following id's: \n 1.1317 \n 2.78379 \n 3.75885 \n 4.28762 \n 5.6038 \n 6.11748 \n 7.9873 \n 8.10501 \n 9.11109 \n 10.3434")

sent_id = input("Enter sentense id:")
# All rows (tokens) of the chosen sentence, taken from the unfiltered frame.
words = check_results.loc[check_results['sentence_id'] == int(sent_id)]
# Raw token texts of that sentence, in row order.
word = list(words['before'])
print(word)

Please Enter one of the following id's: 
 1.1317 
 2.78379 
 3.75885 
 4.28762 
 5.6038 
 6.11748 
 7.9873 
 8.10501 
 9.11109 
 10.3434
Enter sentense id:75885
['Payoneer', 'was', 'founded', 'in', '2005', 'with', '$2 million', 'in', 'seed', 'funding', 'from', 'then', 'CEO', 'Yuval', 'Tal', 'and', 'other', 'private', 'investors', '.']


# Processing the sentence and predicting the class of each token

In [84]:
# Row labels of the selected sentence. The slice below treats the first label
# as a position in `padded_docs` and assumes the rows are consecutive
# (NOTE(review): holds for the default index of the loaded frame -- confirm).
list_of_ids = words['before'].index
start_index = list_of_ids[0]
end_index = start_index + len(list_of_ids)
testing_data = padded_docs[0:len(x_train)]
result = model.predict(testing_data[start_index:end_index])

# Take the most probable class per token and map it back to its class name;
# the decoded list is printed and kept for the conversion step.
classes = list(encoder.inverse_transform(np.argmax(result, axis=1)))
print(classes)

['DATE', 'PLAIN', 'PLAIN', 'PLAIN', 'DATE', 'PLAIN', 'MONEY', 'PLAIN', 'PLAIN', 'PLAIN', 'PLAIN', 'PLAIN', 'LETTERS', 'PLAIN', 'PLAIN', 'PLAIN', 'PLAIN', 'PLAIN', 'PLAIN', 'PUNCT']


# Call regular expression file for corresponding classes

In [69]:
from RegEx_converter_v3 import cardinal, digit, letters, ordinal, address, telephone, electronic, fraction, money, date, plain, verbatim, time

In [85]:
# Converter to apply for each predicted class. `date` and `time` here are the
# functions imported from RegEx_converter_v3, not the standard-library modules.
converters = {
    "CARDINAL": cardinal,
    "DIGIT": digit,
    "LETTERS": letters,
    "ORDINAL": ordinal,
    'ADDRESS': address,
    'TELEPHONE': telephone,
    'ELECTRONIC': electronic,
    'FRACTION': fraction,
    'MONEY': money,
    'PLAIN': plain,
    'DATE': date,
    'VERBATIM': verbatim,
    'TIME': time,
}

output = []
for i, predicted_class in enumerate(classes):
    convert = converters.get(predicted_class)
    if convert is None:
        # Classes without a converter pass the token through unchanged.
        output.append(word[i])
    else:
        output.append(convert(word[i]))

In [86]:
# Join the converted tokens into the spoken-form sentence. `result` is rebound
# from the prediction array to this string, which a later cell prints.
result = ' '.join(output)
print(result)

Payoneer was founded in two thousand five with two million dollars in seed funding from then c e o Yuval Tal and other private investors .


# Inflect engine converter

In [72]:
import inflect
import re

In [87]:
# Baseline: spell out numbers with the inflect engine, token by token.
p = inflect.engine()
# Matches any token that contains a digit. The original character class was
# written `[a-zA-z]`, a typo that also covers the punctuation between 'Z' and
# 'a'; the set of matching tokens is unchanged because that trailing group is
# optional.
digit_token = re.compile(r'.*[0-9]+[a-zA-Z]*')
inflect_result = []
for i in word:
    if digit_token.match(i):
        # Only the digits of the token are kept before conversion.
        inflect_result.append(p.number_to_words(re.sub("[^0-9]", "", i)))
    else:
        inflect_result.append(i)
print(inflect_result)

['Payoneer', 'was', 'founded', 'in', 'two thousand and five', 'with', 'two', 'in', 'seed', 'funding', 'from', 'then', 'CEO', 'Yuval', 'Tal', 'and', 'other', 'private', 'investors', '.']


# Play audio file in Jupyter notebook

In [17]:
import IPython.display as ipd

In [39]:
def switch_code(arg):
    """Return the inflect-engine audio file for a demo sentence id.

    `arg` is the integer sentence id; ids that are not part of the demo set
    yield None.
    """
    engine_audio = dict([
        (78379, "./Audio_files/output_date_engine.mp3"),
        (75885, "./Audio_files/output_money_engine.mp3"),
        (28762, "./Audio_files/output_fraction_engine.mp3"),
        (6038, "./Audio_files/output_time_engine.mp3"),
        (1317, "./Audio_files/output_telephone_engine.mp3"),
        (11748, "./Audio_files/output_combined_LETTERS_engine.mp3"),
        (3434, "./Audio_files/output_combined_Money_engine.mp3"),
        (11109, "./Audio_files/output_combined_DATE_engine.mp3"),
        (9873, "./Audio_files/output_cardinal_engine.mp3"),
        (10501, "./Audio_files/output_ordinal_engine.mp3"),
    ])
    return engine_audio.get(arg)

In [40]:
def switch_lstm(arg):
    """Return the LSTM-pipeline audio file for a demo sentence id.

    `arg` is the integer sentence id; ids that are not part of the demo set
    yield None.
    """
    lstm_audio = dict([
        (78379, "./Audio_files/output_date_lstm.mp3"),
        (75885, "./Audio_files/output_money_lstm.mp3"),
        # NOTE(review): every other entry ends in "_lstm.mp3"; confirm that the
        # file on disk really is named "_lst.mp3".
        (28762, "./Audio_files/output_fraction_lst.mp3"),
        (6038, "./Audio_files/output_time_lstm.mp3"),
        (1317, "./Audio_files/output_telephone_lstm.mp3"),
        (11748, "./Audio_files/output_combined_LETTERS_lstm.mp3"),
        (3434, "./Audio_files/output_combined_Money_lstm.mp3"),
        (11109, "./Audio_files/output_combined_DATE_lstm.mp3"),
        (9873, "./Audio_files/output_cardinal_lstm.mp3"),
        (10501, "./Audio_files/output_ordinal_lstm.mp3"),
    ])
    return lstm_audio.get(arg)

In [88]:
# Rebuild the raw sentence and the inflect-engine rendering as plain strings.
word1 = ' '.join(word)
inflect_result1 = ' '.join(inflect_result)

print("\n Input sentence:", word1)
print("\n\n Ouput of inflect engine: ", inflect_result1)
print("\n\n Running Audio Generated for inflect Engine:")

# Kept as the final expression of the cell so the notebook renders the player.
ipd.Audio(switch_code(int(sent_id)))


 Input sentence: Payoneer was founded in 2005 with $2 million in seed funding from then CEO Yuval Tal and other private investors .


 Ouput of inflect engine:  Payoneer was founded in two thousand and five with two in seed funding from then CEO Yuval Tal and other private investors .


 Running Audio Generated for inflect Engine:


In [90]:
# Show the sentence produced by the class-specific converters, then play the
# matching clip (final expression, so the notebook renders the player).
print("\n Ouput of LSTM: ", result)
print("\n\n Running Audio Generated for LSTM:")
lstm_clip = switch_lstm(int(sent_id))
ipd.Audio(lstm_clip)


 Ouput of LSTM:  Payoneer was founded in two thousand five with two million dollars in seed funding from then c e o Yuval Tal and other private investors .


 Running Audio Generated for LSTM:
