In [49]:
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path

import pkg_resources
pkg_resources.require("spacy==2.3.5")
import spacy
from spacy.util import minibatch, compounding


# --- Training configuration ---------------------------------------------
n_iter = 20  # number of full passes over TRAIN_DATA
SEED = 0     # fixed seed so shuffling / initialisation repeat across runs
LABEL = ["vendor", "totalAmount", "totalLabel", "address", "date"]

# Seed the random number generators *before* the model is created, so that a
# "Restart Kernel -> Run All" reproduces the same shuffles and losses.
spacy.util.fix_random_seed(SEED)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load 'Data/spacy.txt' when it comes from a trusted source.
# TRAIN_DATA must be a mutable sequence of (text, annotations) pairs: it is
# shuffled in place and each batch is unpacked with zip(*batch) below.
with open('Data/spacy.txt', 'rb') as fp:
    TRAIN_DATA = pickle.load(fp)

# Start from a blank English pipeline whose only component is a new NER pipe.
nlp = spacy.blank('en')
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

# Register every entity label the NER component should predict.
for label in LABEL:
    ner.add_label(label)

optimizer = nlp.begin_training()

for itn in range(n_iter):
    # Re-order the examples each pass so batches differ between iterations.
    random.shuffle(TRAIN_DATA)
    losses = {}  # filled in by nlp.update; reset for every pass
    batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=0.1,
                   losses=losses)
    print('Losses', losses)

Losses {'ner': 16978.71791008512}


KeyboardInterrupt: 

In [36]:
# Sample input for a quick manual check of the trained pipeline: two raw
# comma-separated rows (two leading fields, then a double-quoted block of
# receipt text) kept verbatim in a single multi-line string.
# NOTE(review): presumably copied from the OCR CSV used elsewhere in this
# notebook -- confirm. The leading fields and quotes are passed to the model
# as part of the text.
test_text = """
0,0,"DE LUXE CIRCLE FRESH MART SDN BHD (MUTIARA RINI 16) GST NO:001507647488 CO REG NO:797887-W NO, 89&91, JALAN UTAMA, TAMAN MUTIA RINI, 81300 SKUDAI, JOHOR. TEL:016-7780546 MT161201805120055 12/05/18 12:12:57 PM CHU PECK CASHIER: 12/05/18 12:13:17 PM HEAVEN & EARTH AYATAKA GREEN TEA 1.5L 8888002119454 5.00*1 5.00 S KONNYAKU 10G 7.00 S 8888338001119 3.50*2 12.00 2 TOTAL WITH GST @ 6% ITEM: 3 ROUNDING 0.00 QTY: TOTAL SAVING: 0.00 12.00 TOTAL TENDER CASH 50.00 38.00 CHANGE TAX AMOUNT GOODS GST ANALYSIS S = 6% 11.32 0.68 Z = 0% 0.00 0.00 POINTS EARNED: 11 MEMBER 0000036581 MEMBER: WONG SHOO YUEN *THANK YOU. SEE YOU AGAIN !! *CUSTOMER CARE LINE : 012-7092889 *CUSTOMERSERVICE@DELUXEGROUPS.COM"
0,00d0608566104,"SANYU STATIONERY SHOP NO. 31G&33G, JALAN SETIA INDAH X ,U13/X 40170 SETIA ALAM MOBILE /WHATSAPPS : +6012-918 7937 TEL: +603-3362 4137 GST ID NO: 001531760640 TAX INVOICE OWNED BY : SANYU SUPPLY SDN BHD (1135772-K) CASH SALES COUNTER 1. 2012-0029 RESTAURANT ORDER CHIT NCR 3.5""X6"" 3 X 2.9000 8.70 SR TOTAL SALES INCLUSIVE GST @6% 8.70 DISCOUNT 0.00 TOTAL 8.70 ROUND ADJ 0.00 FINAL TOTAL 8.70 CASH 10.00 CHANGE 1.30 AMOUNT(RM) GST SUMMARY TAX(RM) SR @ 6% 8.21 0.49 INV NO: CS-SA-0079030 DATE: 18/04/2017 GOODS SOLD ARE NOT RETURNABLE & REFUNDABLE THANK YOU FOR YOUR PATRONAGE PLEASE COME AGAIN. TERIMA KASIH SILA DATANG LAGI ** PLEASE KEEP THIS RECEIPT FOR PROVE OF PURCHASE DATE FOR I.T PRODUCT WARRANTY PURPOSE ** FOLLOW US IN FACEBOOK : SANYU.STATIONERY"
"""
# Run the pipeline on the sample text and list every predicted entity,
# one per line, as "<label> : <text>".
doc = nlp(test_text)
for entity in doc.ents:
    print("{} : {}".format(entity.label_, entity.text))

vendor : MART SDN BHD
totalAmount : 7780546
date : 12/05/18
address : NO. 31G&33G, JALAN SETIA INDAH X ,U13/X 40170 SETIA ALAM
totalAmount : 8.70
totalLabel : FINAL TOTAL
totalAmount : 8.70
date : 18/04/2017


In [46]:
import pandas as pd

# Run the NER pipeline over every row of the OCR test set and print the
# predicted entities, grouped under a "Doc <index>" heading per row.
test = pd.read_csv("ocr_test.csv")

# The text handed to the model is the third column. Select it explicitly by
# position with .iloc: an integer key on a label-indexed row (row[2]) only
# works through a deprecated positional fallback in pandas. Iterating the
# column directly also avoids building a full row Series for every record.
for i, content in test.iloc[:, 2].items():
    print("\nDoc", i)
    doc = nlp(content)
    for ent in doc.ents:
        print(ent.label_, ":", ent.text)


Doc 0
vendor : MR D.T.Y. (JOHOR) SDN BHD
address : LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERINDUSTRIAN BALAKONG, 43300 SERI KEMBANGAN, SELANGOR
totalLabel : TOTAL ROUNDED
totalAmount : 33.90

Doc 1
vendor : MENARA DION #02-03, LEVEL 2, 27, JALAN SULTAN ISMAIL, 50250 KUALA LUMPUR.
date : 25/06/18
date : 25/06/18

Doc 2
vendor : GERBANG ALAF RESTAURANTS SDN BHD
vendor : LICENSEE OF MCDONALD'S
address : LEVEL 6, BANGUNAN TH, DAMANSARA UPTOWN3 NO.3, JALAN SS21/39,47400 PETA ING JAYA SELANGOR
vendor : MCDONALD'S PERMAS JAYA
date : 25/05/2018
totalAmount : 11.10
totalLabel : TOTAL ROUNDED
totalAmount : 21.10

Doc 3
vendor : AEON CO.
address : 3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMAN MALURI CHERAS, 55100 KUALA LUMPUR
totalAmount : 122.80
totalLabel : TOTAL AFTER ADJ INCL GST

Doc 4
vendor : 99 SPEED MART S/B
address : LOT P.T. 2811, JALAN ANGSA, TAMAN BERKELEY 41150 KLANG, SELANGOR
date : 20-02-18
totalLabel : TOTAL SALES (INCLUSIVE GST)

Doc 5
vendor : GARDENIA BAKERIES (KL) SDN 