In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import spacy
from pathlib import Path

# Load the trained spaCy pipeline from the local "./models" directory.
# `nlp` is the callable used to annotate text.
output_dir = Path("./models")
print("Loading spacy model from ", output_dir)
nlp = spacy.load(output_dir)
print("Finished Loading")

Loading spacy model from  models
Finished Loading


In [56]:
img_test_path = "./ocr_new.csv"

# Run the NER model over every OCR row. For each row keep the raw row text
# together with the (label, text) pairs of the entities found in it.
TEST_DATA = []
with open(img_test_path, 'r') as file:
    # Strip surrounding whitespace and drop the header row. The file is only
    # needed for reading, so it is closed before the model is run.
    lines = [line.strip() for line in file][1:]

for line in lines:
    # A blank row (e.g. a trailing newline at the end of the file) has no
    # comma-separated fields, so taking its second field as the document id
    # would raise IndexError. There is nothing to annotate, so skip it.
    if not line:
        continue

    doc = nlp(line)
    entities = {"entities": [(ent.label_, ent.text) for ent in doc.ents]}
    TEST_DATA.append((line, entities))

In [58]:
# Step 1: for every annotated row, group the recognised entity texts by label
# and tag each one with the document id of the row it came from.
row_labels = []
for content, obj in TEST_DATA:
    entities = obj["entities"]
    labels = {name: [] for name in ("totalAmount", "totalLabel", "date", "address", "vendor")}

    # The document id is the second comma-separated field of the raw row.
    docId = content.split(",")[1]

    for entity in entities:
        key, value = entity[0], entity[1]
        labels[key].append((docId, value))

    row_labels.append(labels)

# Step 2: pool the per-row groups into one list per output field.
# The key order of `bucket` matters: later cells address the per-field
# results by position.
bucket = {
    "amount": [],
    "totalLabel": [],
    "date": [],
    "vendor_address": [],
    "vendor_name": [],
}

# Entity label -> output field name.
keyMap = {
    "totalAmount": "amount",
    "vendor": "vendor_name",
    "totalLabel": "totalLabel",
    "date": "date",
    "address": "vendor_address",
}

for item in row_labels:
    for k, v in item.items():
        bucket[keyMap[k]] += v

In [59]:
import re

# Keep only amounts whose text begins with a decimal number (digits, a dot,
# digits). re.match anchors at the start of the string only, so any text after
# the number is still accepted.
# The pattern is a raw string so that "\d" reaches the regex engine as written
# instead of being treated as an (invalid) string escape sequence.
bucket["amount"] = [pair for pair in bucket["amount"] if re.match(r"\d+[.]\d+", pair[1])]

In [94]:
from sentence_transformers import SentenceTransformer

# Vectorize all of the fields: build one DataFrame per bucket key holding the
# document id, the raw entity text and an embedding of that text.
dataframes = []
model = SentenceTransformer('bert-base-nli-mean-tokens')

for k in bucket.keys():
    ids = [v[0] for v in bucket[k]]
    values = [v[1] for v in bucket[k]]

    if k != "amount":
        # Encode each text exactly once; the sentence model is the expensive
        # step, so it is not run for fields that do not use its output.
        embeddings = list(model.encode(values))
    else:
        # Amounts are compared numerically: the "embedding" is the value
        # itself wrapped in a one-element vector.
        embeddings = [np.array([float(e)]) for e in values]

    obj = {
        "documentid": ids,
        k: values,
        k + "_embedding": embeddings,
    }

    df = pd.DataFrame(obj)
    dataframes.append(df)

Downloading (…)a9bfc/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Downloading (…)297d1a9bfc/README.md:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading (…)7d1a9bfc/config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)a9bfc/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Downloading (…)297d1a9bfc/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d1a9bfc/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [95]:
# Outer-join the per-field frames on the document id, starting from the first
# frame and folding in the frames at positions 2, 3 and 4 (position 1 is not
# merged).
merge = dataframes[0]
for position in (2, 3, 4):
    merge = pd.merge(merge, dataframes[position], on='documentid', how='outer')

img_df = merge
img_df

Unnamed: 0,documentid,amount,amount_embedding,date,date_embedding,vendor_address,vendor_address_embedding,vendor_name,vendor_name_embedding
0,00d0243046961,9.65,[9.65],29/03/18,"[-0.6459081, 1.4347223, -0.4738709, -0.9654732...",BOOK CO. (M) SDN BHD,"[0.13748485, 0.01564531, 0.40901953, -0.939231...",,
1,00d0243046961,9.65,[9.65],29/03/18,"[-0.6459081, 1.4347223, -0.4738709, -0.9654732...","No 8, Jalan 7/118B, Desa Tun Razak 56000 Kuala...","[0.578864, 0.55168676, -0.109442174, -0.736170...",,
2,00d0164592408,6.00,[6.0],2018,"[-0.20909913, 1.8728611, 0.05629508, -0.401791...","No.2, Jalan Temenggung 19/9, Seksyen 9, Bandar...","[0.66079926, 1.2506262, 0.30348724, -0.5321358...",,
3,00d0142736377,35.00,[35.0],13-06-2018,"[0.36338586, 2.3204677, -0.05483611, -0.403898...","NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...","[0.67402095, 1.0543537, 0.5044794, -0.41738924...",THREE SEAFOOD RESTAURANT SDN BHD,"[0.30204177, -0.07559246, -0.34028977, 0.72516..."
4,00d0530724847,6.00,[6.0],19-03-2018,"[-0.026938912, 1.9348917, -0.29990977, -0.5566...","No.2, Jalan Temenggung 19/9, Seksyen 9, Bandar...","[0.66079926, 1.2506262, 0.30348724, -0.5321358...",,
...,...,...,...,...,...,...,...,...,...
722,00d0464742744,,,,,,,"0,00d0464742744,""SANYO STATIONERY SHOP NO. 31G...","[0.168012, 0.31575555, 0.2365051, -0.34233254,..."
723,00d0358154075,,,,,,,"STATIONERY SHOP NO. 31G&33G, JALAN SETIA INDAH...","[0.12828787, 0.30251566, 0.395815, -0.34087104..."
724,00d0937018331,,,,,,,"STATIONERY SHOP NO. 31G&33G, JALAN SETIA INDAH...","[0.12828787, 0.30251566, 0.395815, -0.34087104..."
725,00d0803190070,,,,,,,"STATIONERY SHOP NO. 31G&33G, JALAN SETIA INDAH...","[0.08707425, 0.28516972, 0.3332275, -0.3129031..."


In [96]:
from sklearn.metrics.pairwise import cosine_similarity
from numpy import linalg

# Initial columns of the distance table that getImageDists builds.
dist_cols = ["image_doc_id", "user_doc_id", "amount_dist", "vendor_name_dist", "vendor_address_dist", "date_dist"]
# Embedding columns compared between a user record and every image row. The
# text before the final "_embedding" suffix is the name of the raw-value column.
# Note: getEmbedding defines its own local `cols`, which is a different list.
cols = ["amount_embedding", "vendor_name_embedding", "vendor_address_embedding", "date_embedding"]

def getEmbedding(x):
    """Return a copy of ``x`` with an ``<field>_embedding`` entry per field.

    ``x`` is a record supporting ``.copy()`` and key access (the callers in
    this notebook pass rows of a DataFrame) with the keys ``date``,
    ``vendor_name``, ``vendor_address`` and ``amount``. The three text fields
    are encoded with the module-level ``model``; the amount is stored as its
    float value. ``x`` itself is not modified.
    """
    embedded = x.copy()

    # Encode each text field and store the vector as a list of its components.
    for field in ('date', 'vendor_name', 'vendor_address'):
        vector = model.encode(embedded[field])
        embedded[field + "_embedding"] = list(vector)

    # The embedding of an amount is just the amount itself.
    embedded["amount_embedding"] = [np.array(float(x["amount"]))]
    return embedded

def getImageDists(x):
    """Compute per-field distances between one user record and every image row.

    Parameters
    ----------
    x : record accepted by ``getEmbedding`` (must also carry ``documentid``).

    Returns
    -------
    pandas.DataFrame
        One row per row of the module-level ``img_df``, in the same order,
        with a fresh 0..n-1 index. The ``dist_cols`` columns come first,
        followed by ``<field>_img`` / ``<field>_user`` columns holding the raw
        values that were compared. Each ``<field>_dist`` is the Euclidean norm
        of the difference between the user embedding and the image embedding.
    """
    # Get the embedding for the x value
    ref = getEmbedding(x)
    user_id = ref['documentid']

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x, and appending row by
    # row copies the whole frame on every iteration.
    records = []
    for _, row in img_df.iterrows():
        entry = {
            "image_doc_id": row["documentid"],
            "user_doc_id": user_id,
        }

        for embed_col in cols:
            # "vendor_name_embedding" -> "vendor_name"
            value_col = embed_col.rsplit("_", 1)[0]

            dist = linalg.norm(np.array(ref[embed_col]) - np.array(row[embed_col]))

            entry[value_col + "_img"] = row[value_col]
            entry[value_col + "_user"] = ref[value_col]
            entry[value_col + "_dist"] = dist

        records.append(entry)

    # Every record has the same keys; keep dist_cols in front and place the
    # remaining raw-value columns after them. With no image rows the result is
    # an empty frame that still has the dist_cols columns.
    extra_cols = [name for name in records[0] if name not in dist_cols] if records else []
    return pd.DataFrame(records, columns=list(dist_cols) + extra_cols)

In [98]:
def match(x, n=1, w=(1, 1, 1, 1)):
    """Return the ``n`` best-scoring image candidates for the user record ``x``.

    Equivalent to ``matches(x, w).head(n)``; see ``matches`` for the scoring.
    """
    return matches(x, w).head(n)

def matches(x, w=(1, 1, 1, 1)):
    """Rank every image candidate against the user record ``x``.

    Parameters
    ----------
    x : record passed straight to ``getImageDists``.
    w : four weights applied, in order, to the amount, vendor-name,
        vendor-address and date distances. Any indexable sequence works; the
        default is an immutable tuple so it cannot be altered between calls.

    Returns
    -------
    The distance table from ``getImageDists`` with an added ``sum`` column
    (the weighted sum of the four distances), sorted by ``sum`` ascending,
    i.e. best candidate first.
    """
    dists = getImageDists(x)
    dists["sum"] = (
        (w[0] * dists["amount_dist"])
        + (w[1] * dists["vendor_name_dist"])
        + (w[2] * dists["vendor_address_dist"])
        + (w[3] * dists["date_dist"])
    )
    return dists.sort_values(["sum"])

In [None]:
from tqdm import tqdm

# Evaluate top-1 matching accuracy of the user records against the image rows.
test = pd.read_csv("./dev_users.csv")

t = 0            # user records whose best-ranked image is their own document
n = len(test)    # user records that have at least one row in img_df
tested = 0       # records processed so far that have a row in img_df

# Document ids present among the images; built once instead of rescanning the
# column on every miss.
known_image_ids = set(img_df["documentid"].values)

# iterrows() has no length, so pass the total explicitly to get a real
# progress bar with an ETA.
bar = tqdm(test.iterrows(), total=len(test))
for i, row in bar:
    res = match(row, 3, [1, 1, 3, 1])
    tested += 1

    if (res["image_doc_id"].values[0] == res["user_doc_id"].values[0]):
        print("match")
        print(res)
        t += 1
    else:
        print("no match")
        print(res)
        if res["user_doc_id"].values[0] not in known_image_ids:
            # No image exists for this record, so it could never have matched:
            # leave it out of the adjusted accuracy.
            n -= 1
            print(res["user_doc_id"], "not in imgs")
            tested -= 1

    # Exact ratio over the records counted so far; 0.0 until the first
    # countable record, rather than dividing by a fudge value.
    print("Running Adj Acc:", (t / tested) if tested else 0.0)

print("Total Acc:", t, "/", len(test))
print("Adj. Acc:", t, "/", n)

1it [00:03,  3.78s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
194  00d0353443719  00d0320587587         0.44         13.354272   
109  00d0186887844  00d0320587587         2.70          9.987178   
193  00d0353443719  00d0320587587         2.20         13.354272   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
194            10.898200  13.188985      29.24         28.8  09/03/2018   
109            10.129859  17.154823      26.10         28.8          18   
193            10.898200  13.188985      31.00         28.8  09/03/2018   

     date_user                                 vendor_address_img  \
194  2019-1-23  SRD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
109  2019-1-23  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
193  2019-1-23  SRD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   

                                   vendor_address_user       vendor_name_img  \
194  NO. 443, JALAN KURAU, SUNGAI RENGIT, 81620 PEN...      

2it [00:07,  3.63s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
35   00d0366175414  00d0333208815          0.6          0.000016   
184  00d0841246769  00d0333208815          1.1          0.000016   
66   00d0821795963  00d0333208815          3.0          0.000016   

     vendor_address_dist  date_dist amount_img  amount_user     date_img  \
35              0.000011  15.512054       7.60          8.2  18 Apr 2018   
184             0.000011  15.594954       7.10          8.2  03 Apr 2018   
66              0.000011  15.942965      11.20          8.2  05 May 2018   

     date_user                                 vendor_address_img  \
35   2018-4-16  12, Jalan Tampoi 7/4,Kawasan Perindustrian Tam...   
184  2018-4-16  12, Jalan Tampoi 7/4,Kawasan Perindustrian Tam...   
66   2018-4-16  12, Jalan Tampoi 7/4,Kawasan Perindustrian Tam...   

                                   vendor_address_user  \
35   12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAM...   
184  12, JALAN TAMPO

3it [00:11,  3.82s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
99   00d0192025858  00d0116292534         2.10         19.955214   
317  00d0569507667  00d0116292534         1.48         23.240124   
111  00d0449498451  00d0116292534         0.68         20.101362   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
99             13.444457  13.359706      11.68        13.78  05/08/2017   
317            12.234123  15.995770      12.30        13.78  14/04/2018   
111            13.803770  15.411748      13.10        13.78  18/03/2018   

    date_user                                 vendor_address_img  \
99   2017-7-7  GROUND FLOOR, NO. 4 & 6, JALAN SS 15/4B, 47500...   
317  2017-7-7  LOT G29 & G30, AEON Mall TEBRAU CITY, NO, 1, J...   
111  2017-7-7  Level’6, Bangunan TH, Damansara Uptown3 No.3, ...   

                                   vendor_address_user  \
99   LOFT B-005-006, BASEMENT LEVEL 1 THE STARLING ...   
317  LOFT B-005-006, BASEMEN

4it [00:15,  3.81s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
338  00d0797361085  00d0973462000         1.64         19.164242   
337  00d0797361085  00d0973462000         1.64         19.164242   
193  00d0353443719  00d0973462000         2.50         18.969934   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
338            15.302782  10.055247      35.14         33.5  01/10/2017   
337            15.302782  10.055247      35.14         33.5  01/10/2017   
193            13.408449  16.698231      31.00         33.5  09/03/2018   

     date_user                                 vendor_address_img  \
338  2017-1-11  Lot 3, Jalan Pelabur 23/1, 40300 Shah Alam, Se...   
337  2017-1-11  Lot 3, Jalan Pelabur 23/1, 40300 Shah Alam, Se...   
193  2017-1-11  SRD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   

                                   vendor_address_user  \
338  NO. 31G&33G, JLAAN SETIA INDAH X ,U13/X 4017 0...   
337  NO. 31G&33G, JLAAN 

5it [00:18,  3.79s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
214  00d0280864237  00d0280864237         0.00          0.000009   
51   00d0677299556  00d0280864237         2.32         20.705523   
229  00d0338048914  00d0280864237        47.70          0.000009   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
214             4.068748   8.345243     174.90        174.9  31/03/2018   
51              8.870045  18.492744     177.22        174.9    11-05-16   
229             3.848344  11.243047     127.20        174.9  06/03/2018   

     date_user                                 vendor_address_img  \
214  2018-3-31     LOT 276 JALAN BANTING 43800 DENGKIL, SELANGOR.   
51   2018-3-31  LOT P.T. 33196, BATU 4 JALAN KAPAR, MUKIM KAPA...   
229  2018-3-31     LOT 276 JALAN BANTING 43800 DENGKIL, SELANGOR,   

                                vendor_address_user        vendor_name_img  \
214  LOT 276 JLAN BANTINNG 43800 DENGKIL, SELANGOR.  KEDAI PAPAN 

6it [00:23,  3.98s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
265  00d0734092781  00d0330285744          2.4          8.486708   
257  00d0916574976  00d0330285744          0.1          8.765873   
203  00d0652046562  00d0330285744          6.0          8.155969   

     vendor_address_dist  date_dist amount_img  amount_user   date_img  \
265             2.209239  13.278356      12.30          9.9  24 -03-18   
257             2.266804  17.287016      10.00          9.9         18   
203             2.266804  16.220646       3.90          9.9      11-17   

      date_user                                 vendor_address_img  \
265  2017-12-28  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
257  2017-12-28  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
203  2017-12-28  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user  \
265  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
257  LOT 1851-A & 1851-B

7it [00:27,  3.98s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
34   00d0167381962  00d0167381962         0.33          4.154698   
208  00d0795468800  00d0167381962         3.40         17.666758   
263  00d0225816395  00d0167381962         3.46         12.521903   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
34              4.017047   8.278746       5.57          5.9  22/03/2018   
208             9.684624  12.103020       2.50          5.9  03/05/2018   
263            10.442649  15.329567       9.36          5.9  17/08/2017   

     date_user                                 vendor_address_img  \
34   2018-3-22  3 FLOOR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
208  2018-3-22  69, JALAN TEMENGGUNG 3/9 BANDAR MAHKOTA CHERAS...   
263  2018-3-22  Lot 3, Jalan Pelabur 23/1, 40300 Shalt Alam, S...   

                                   vendor_address_user        vendor_name_img  \
34   3 FLOOR, AENO TAMAN MALURI SC JLN JEJAKA, TAMA...      AE

8it [00:31,  3.96s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
124  00d0622420051  00d0502129574         9.75          0.000014   
193  00d0353443719  00d0502129574        17.35          0.000014   
120  00d0622420051  00d0502129574        17.75          0.000014   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
124             8.946380  13.184356      38.60        48.35  12/06/2018   
193             8.909942  12.410274      31.00        48.35  09/03/2018   
120             8.946380  13.184356      30.60        48.35  12/06/2018   

     date_user                                 vendor_address_img  \
124  2018-4-22  SRO FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
193  2018-4-22  SRD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
120  2018-4-22  SRO FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   

                                   vendor_address_user   vendor_name_img  \
124  3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...  AEON CO.

In [79]:
# Save the full ranked candidate list for one user record as a worked example.
# The ranking function defined in this notebook is `matches`; there is no
# `matchs`, so the previous spelling raised NameError on a fresh kernel.
matches(test.iloc[9]).to_csv("Model_Example.csv")