## Siamese Bidirectional LSTM for Finding the Supplier of OCR generated invoice

In [1]:
import logging
import pandas as pd
import ast
import numpy as np
from joblib import dump, load
import re
import string

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

K = tf.keras.backend
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

import nlpaug.augmenter.char as nac

import tensorflow as tf
from tensorflow.keras import models
from tensorflow.keras import layers
import tensorflow.keras.regularizers  as reglzr
from nltk.corpus import reuters
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

nltk.download('reuters')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [2]:
MODEL_FILENAME = 'model.ml'

AUG_FILE = 'augfile.csv'
SUP_FILE ='supfile.csv'
MAXNUM_AUG = 1000

EMBEDDING_DIM = 10
VOCAB_SIZE = 5000
MAX_SEQ_LEN = 300
BATCH_SIZE = 100
LOG_FILE = 'logfile.log'

In [3]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
fh = logging.FileHandler(LOG_FILE)
fh.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

logger.addHandler(fh)
logger.addHandler(ch)

In [4]:


def read_invoice(file_name):
    """
    Reads a file of invoice content created by OCR and converts it to a pandas dataframe
    
    :param file_name: Name of the file containing invoice content
    :return: A dataframe of the invoice words 
    
    """
    with open(file_name,'r') as f:
        words = "[{}]".format(f.read().strip())
    words = words.replace("\n", ",")
    words = ast.literal_eval(words)
    inv_df = pd.DataFrame(words)
    # sort the words according to their order in the original doc
    inv_df = inv_df.set_index(['page_id','line_id','pos_id'])
    inv_df.sort_index(inplace=True)
    
    return ' '.join(inv_df['word'])


lemat = WordNetLemmatizer()
def clean(item):
    """
    preprocessing the input string to remove the unwanted characters and substrings     
    :param item: input string
    :return: a string in which unwanted characters and substrings are removed 
    
    """
    res = item.encode("ascii", errors="ignore").decode()

    res = re.sub('['+string.punctuation+']+','',res).strip()
    res = word_tokenize(res.lower())
    res = [lemat.lemmatize(item) for item in res if item not in stopwords.words('english')]    
    return   " ".join(res)


def build_tokenizer(docs):
    logger.info('build tokenizer.')
    uwords = set()
    docs.str.lower().str.split().apply(uwords.update)
    vocab_size = len(uwords)
    tknzr = Tokenizer(num_words=vocab_size, oov_token="<OOV>")    
    tknzr.fit_on_texts(uwords)    
    return tknzr



In [5]:
def build_model(num_hidden=50, dropout=0.2, recurrent_dropout=0.2):
    
    # Create the shared encoder
    encoder = models.Sequential()
    encoder.add(layers.Embedding(VOCAB_SIZE,EMBEDDING_DIM ,input_shape=(MAX_SEQ_LEN,)) )
    encoder.add(layers.Bidirectional(layers.LSTM(num_hidden, kernel_regularizer=reglzr.l2(1e-4), dropout=dropout, recurrent_dropout=recurrent_dropout)))
    encoder.add(layers.Dense(num_hidden, kernel_regularizer=reglzr.l2(1e-4)))
    
    
    # Invoice input
    inv_in = layers.Input(shape=(MAX_SEQ_LEN,),dtype=tf.int32,name='inv_in')
    # Supplier input
    sup_in = layers.Input(shape=(MAX_SEQ_LEN,),dtype=tf.int32,name='sup_in')
    
    # Seperated encoders for invoice and supplier
    inv_encoder = encoder(inv_in)
    sup_encoder = encoder(sup_in)
    
    # loss layer
    L1_layer =  layers.Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    
    L1_distance = L1_layer([inv_encoder, sup_encoder])
    
    # Add a dense layer with a sigmoid unit to generate the similarity score
    prediction = layers.Dense(1,activation='sigmoid')(L1_distance)

    
    model = models.Model(inputs=[inv_in, sup_in], outputs=[prediction])
    model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(2e-3), metrics=['accuracy'])
    
    return model
    
    


In [6]:
def create_training_data(suppliers, n_samples_per_sup=10):
    """
    Given a dataframe of suppliers, creates a dataset of positive and negative pairs of invoice-supplier, given a dataframe of suppliers.
    
    :param suppliers: a dataframe (columns= ['Id', 'SupplierName' ]) of supplier names 
    :param n_samples_per_sup: number of positive and negative pairs generated per supplier name
    :return: a Dataframe (columns= ['invoice', 'SupplierName', 'label']) of invoice-supplier pairs 
             where the label=1 for a positive samples (invoice contains the supplier name) and label=0 
             if invoice does not contains the supplier name.
    """

    def inject_pos(item):
        point = np.random.randint(0,len(item['invoice'])-1)
        
        item_text = item['invoice']
        item_num = item['num']
        return item_text[0:point]+' {} '.format(supplier_names[item_num]) + item_text[point:]
    
    
    txt = nltk.Text(reuters.words())
    words =np.array(txt)[np.random.randint(0,1720901, 10000)]
    
    supplier_names = suppliers['SupplierName'].tolist()    
    num_suppliers = len(supplier_names)

    df_positive = []
    for i, row in suppliers.iterrows():
        df_positive += [pd.DataFrame([[i,row['Id'],' '.join(words[np.random.randint(0,10000, np.random.randint(10, MAX_SEQ_LEN))].tolist() )]],columns=['num','Id','invoice']) for i in range(n_samples_per_sup)]
    df_positive = pd.concat(df_positive)     
    
    
    df_negative = []
    for i, row in suppliers.iterrows():
        df_negative += [pd.DataFrame([[i,row['Id'],' '.join(words[np.random.randint(0,10005, np.random.randint(10, MAX_SEQ_LEN))].tolist() )]],columns=['num','Id','invoice']) for i in range(n_samples_per_sup)]
        break
    df_negative = pd.concat(df_negative)     
    

    df_positive['invoice'] = df_positive.apply(inject_pos,axis=1)   
    

    df_positive['label'] = 1
    df_negative['label'] = 0
    
    
    df_all = pd.concat([df_positive, df_negative])    
    df_all = df_all.join(suppliers.set_index('Id'), on='Id')
    
    return df_all[['invoice', 'SupplierName', 'label']]
    
    

    
    


In [7]:
def train(suppliers, epochs=50,  num_sample_per_supplier=10):
    dataset_df = create_training_data(suppliers, num_sample_per_supplier)
    
    X = dataset_df[['invoice', 'SupplierName']]
    Y = dataset_df['label']
    
    
    tknzr = build_tokenizer(suppliers['SupplierName'])
    invoices =tknzr.texts_to_sequences(X['invoice'].str.split().values)
    suppliers = tknzr.texts_to_sequences(X['SupplierName'].str.split().values)    
    invoices  = pad_sequences(invoices,maxlen=MAX_SEQ_LEN)
    suppliers  = pad_sequences(suppliers,maxlen=MAX_SEQ_LEN)

    dataset = tf.data.Dataset.from_tensor_slices(({'inv_in':tf.cast(invoices,tf.int32),'sup_in':tf.cast(suppliers,tf.int32)} ,tf.cast( Y.values, tf.int32)))
      
    dataset = dataset.shuffle(len(dataset_df)).batch(BATCH_SIZE)
    
    model = build_model()
    
    model.fit(dataset,  epochs=epochs)   
    
    return model, tknzr 
    

In [18]:

def predict(model, tknzr,suppliers, query):
    

    suppliers = tknzr.texts_to_sequences(suppliers['SupplierName'].str.split().values)    
    suppliers  =tf.cast( pad_sequences(suppliers,maxlen=MAX_SEQ_LEN), tf.int32)
    query = clean(query )
    
    query = tknzr.texts_to_sequences(query.split())
    query = tf.cast(pad_sequences(query,maxlen=MAX_SEQ_LEN),tf.int32)
    
    result =np.array([model.predict([query], [supplier]) for supplier in suppliers[:]])
    return np.argmax(result)

    
        
    

In [9]:
# Test
sup_df = pd.read_csv('suppliernames.txt')
sup_df['SupplierName'] = sup_df['SupplierName'].apply(clean)


query = read_invoice('invoice.txt')
query = clean(query)


model, tknzr  = train(suppliers =sup_df )


build tokenizer.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [19]:

predict(model, tknzr,sup_df, query)

ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 2 array(s), but instead got the following list of 1 arrays: [<tf.Tensor: id=12352, shape=(117, 300), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],...