In [1]:
## Find the supplier of an OCR-generated invoice using k-nearest neighbours (k-NN) and OCR-style augmented supplier names

In [2]:
import logging
import pandas as pd
import ast
import sklearn
from sklearn.neighbors import NearestNeighbors    
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump, load
import re
import string

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

import nlpaug.augmenter.char as nac

# Fetch the NLTK resources this notebook uses: the 'punkt' tokenizer models
# (word_tokenize), the 'stopwords' corpus and 'wordnet' (WordNetLemmatizer).
# When a package is already present NLTK reports it as up to date.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mashallaryan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# ---- Notebook configuration -------------------------------------------------

# File names reserved for persisting the fitted k-NN model and the TF-IDF
# vectorizer (not referenced by the cells shown in this notebook).
SKLEARN_MODEL_FILENAME = "model.ml"
VECTORIZER_FILENAME = "vect.ml"

# CSV written by train(): the Id of every row the k-NN index was fitted on.
AUG_FILE = "augfile.csv"
# CSV written by train(): the supplier table used to map an Id back to a name.
SUP_FILE = "supfile.csv"

# Number of augmented variants requested per supplier name.
MAXNUM_AUG = 1000

# Destination of the file log handler.
LOG_FILE = "logfile.log"

In [4]:
# Notebook-wide logger: DEBUG and above goes both to LOG_FILE and to the
# cell output stream.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object on every call, so re-running this
# cell in a live kernel would stack a second pair of handlers and every
# message would be emitted twice. Drop (and close) any handlers left over
# from a previous run before attaching fresh ones.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

fh = logging.FileHandler(LOG_FILE)
fh.setLevel(logging.DEBUG)

ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)

logger.addHandler(fh)
logger.addHandler(ch)

In [5]:


def read_invoice(file_name):
    """
    Reads a file of invoice content created by OCR and returns its text.

    The file is parsed as a sequence of Python dict literals, one per line.
    Each record must provide the keys 'page_id', 'line_id', 'pos_id' and
    'word'; the first three are used to restore the reading order.

    :param file_name: Name of the file containing invoice content
    :return: The invoice words joined by single spaces, ordered by
             (page_id, line_id, pos_id). An empty string is returned when the
             file contains no records.
    :raises SyntaxError, ValueError: if a line is not a valid Python literal
    :raises KeyError: if a record lacks one of the required keys

    """
    with open(file_name, 'r') as f:
        # Skip blank lines: joined with commas they would produce empty list
        # items (",,") and make literal_eval fail.
        records = [line.strip() for line in f if line.strip()]

    if not records:
        # Nothing to index or sort; an empty invoice has empty text.
        return ''

    words = ast.literal_eval("[{}]".format(",".join(records)))
    inv_df = pd.DataFrame(words)

    # sort the words according to their order in the original doc
    inv_df = inv_df.set_index(['page_id', 'line_id', 'pos_id']).sort_index()

    # astype(str) keeps the join working when a word was recorded as a number
    return ' '.join(inv_df['word'].astype(str))


lemat = WordNetLemmatizer()

# string.punctuation contains regex metacharacters (']', '\', '^', '-').
# Unescaped, the sequence '\]' is read as an escaped bracket, so backslashes
# were never part of the character class; re.escape makes every punctuation
# character match literally.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']+')


def clean(item):
    """
    preprocessing the input string to remove the unwanted characters and substrings

    Steps, in order: drop non-ASCII characters, delete ASCII punctuation,
    lower-case and tokenize, discard English stop words, lemmatize the
    remaining tokens.

    :param item: input string
    :return: a string in which unwanted characters and substrings are removed;
             the surviving lemmatized tokens are joined by single spaces

    """
    res = item.encode("ascii", errors="ignore").decode()
    res = _PUNCT_RE.sub('', res).strip()
    tokens = word_tokenize(res.lower())

    # Load the stop-word list once per call (not once per token) and use a
    # set so each membership test is constant time.
    stop_words = set(stopwords.words('english'))

    return " ".join(lemat.lemmatize(token) for token in tokens if token not in stop_words)



def augment_data(suppliers, n_aug=None):
    """
    Extend the supplier table with OCR-style noisy variants of every name.

    :param suppliers: DataFrame with at least the columns 'Id' and 'SupplierName'
    :param n_aug: number of variants requested per supplier name; defaults to
                  MAXNUM_AUG when omitted
    :return: A new DataFrame with a fresh 0..N-1 index holding the augmented
             rows (same 'Id', noisy 'SupplierName') followed by the original
             rows. Any additional columns of `suppliers` are empty (NaN) for
             the augmented rows.

    """
    if n_aug is None:
        n_aug = MAXNUM_AUG

    logger.info('Data Augmentation: this might take some time...')
    aug = nac.OcrAug()

    # Collect plain records and build a single frame at the end, instead of
    # creating one DataFrame per augmented string. Columns are addressed by
    # name so the result does not depend on the column order of `suppliers`.
    records = []
    for sup_id, name in zip(suppliers['Id'], suppliers['SupplierName']):
        variants = aug.augment(name, n=n_aug)
        if isinstance(variants, str):
            # Some nlpaug versions return a bare string for a single result.
            variants = [variants]
        records.extend({'Id': sup_id, 'SupplierName': variant} for variant in variants)

    if not records:
        # Nothing was generated (e.g. empty input): hand back a copy so the
        # caller still receives a new object, as pd.concat would produce.
        return suppliers.copy()

    aug_df = pd.DataFrame(records, columns=['Id', 'SupplierName'])
    return pd.concat([aug_df, suppliers], ignore_index=True)


def train(sup_df, do_augmentation=True):
    """
    Fit a TF-IDF vectorizer and a 1-nearest-neighbour index on supplier names.

    Side effects: writes the 'Id' column of the rows used for fitting to
    AUG_FILE and the full `sup_df` to SUP_FILE.

    :param sup_df: DataFrame with the columns 'Id' and 'SupplierName'
    :param do_augmentation: when True, fit on the output of augment_data(sup_df)
                            instead of on `sup_df` itself
    :return: tuple (fitted NearestNeighbors model, fitted TfidfVectorizer)

    """
    # Word-level TF-IDF over 1- to 4-grams
    tfidf = sklearn.feature_extraction.text.TfidfVectorizer(ngram_range=(1, 4), analyzer='word')

    # Choose the table the index is fitted on: the suppliers plus their
    # OCR-noised variants, or the suppliers alone.
    training_df = augment_data(sup_df) if do_augmentation else sup_df

    # TF-IDF features of the supplier names
    features = tfidf.fit_transform(training_df['SupplierName'])

    # Persist the Id of every fitted row, then the supplier table itself
    training_df[['Id']].to_csv(AUG_FILE)
    sup_df.to_csv(SUP_FILE)

    # Nearest-neighbour index over the TF-IDF rows
    knn = NearestNeighbors(n_neighbors=1).fit(features)

    return knn, tfidf

def predict(nearestnbr, vectorizer, query):
    """
    Find the supplier whose (possibly augmented) name is closest to any single
    word of the query.

    The query is cleaned and split on whitespace; every word is vectorized on
    its own and looked up in the nearest-neighbour index. The word with the
    smallest distance to its nearest neighbour decides the result.

    :param nearestnbr: fitted NearestNeighbors model returned by train()
    :param vectorizer: fitted TfidfVectorizer returned by train()
    :param query: raw or already cleaned invoice text
    :return: tuple (supplier Id, supplier name)
    :raises ValueError: if the query contains no words after cleaning

    """
    # Load the required DataFrames
    aug_sup_df = pd.read_csv(AUG_FILE)
    sup_df = pd.read_csv(SUP_FILE)

    # Clean the query and split it into single-word documents
    tokens = clean(query).split()
    if not tokens:
        # Fail here with a clear message rather than deep inside scikit-learn.
        raise ValueError('Cannot predict a supplier: the query is empty after cleaning')

    # Convert the query words to TF-IDF features
    query_tfidf = vectorizer.transform(tokens)

    # distances/indices have one row per query word and one column per neighbour
    distances, indices = nearestnbr.kneighbors(query_tfidf)

    # Neighbours are ordered by increasing distance, so column 0 holds each
    # word's nearest neighbour. Taking the argmin over that column (rather
    # than over the flattened array) stays correct if the model was built
    # with n_neighbors > 1.
    best_row = distances[:, 0].argmin()
    train_row = indices[best_row, 0]

    sup_id = aug_sup_df['Id'].iloc[train_row]
    names = sup_df.loc[sup_df['Id'] == sup_id, 'SupplierName']
    return sup_id, names.values[0]



# def main(args):
#     inv_file = args['invoice']
#     sup_file = args['suppliers']
    
#     # read suppliers names
#     sup_df = pd.read_csv(sup_file)
#     sup_df['SupplierName_c'] = sup_df['SupplierName'].apply(clean)
    
    
#     nearestnbr, vectorizer = train(sup_df, do_augmentation=True )

#     query = read_invoice(inv_file)
    
#     query = clean(query)
    
#     return predict(nearestnbr, vectorizer, query)



In [12]:
# Print the working directory; the relative file names used in the cells below resolve against it.
!pwd


/local/scratch/mashall/scratch/Myprojects/Find_supplier


In [10]:
# Load the supplier table and normalise the names with the same cleaning
# that predict() applies to the invoice text.
sup_df = pd.read_csv('suppliernames.txt')
sup_df['SupplierName'] = sup_df['SupplierName'].map(clean)

# Fit the TF-IDF vectorizer and the nearest-neighbour index on the
# augmented supplier names.
nearestnbr, vectorizer = train(sup_df, do_augmentation=True)


Data Augmentation: this might take some time...


In [11]:
# Flatten the OCR invoice into one cleaned string of words.
query = clean(read_invoice('invoice.txt'))

# Look up the closest supplier; the (Id, name) tuple is the cell output.
predict(nearestnbr, vectorizer, query)

(3153303, 'demo company')