In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
import spacy
from pathlib import Path

# Load the spaCy pipeline stored under ./models. The resulting `nlp` object
# is what later annotates each line of the OCR CSV; no test data is read here.
output_dir = Path("./models")
print("Loading spacy model from ", output_dir)
nlp = spacy.load(output_dir)
print("Finished Loading")

Loading spacy model from  models
Finished Loading


In [137]:
img_test_path = "./ocr_all.csv"

# Each TEST_DATA element is (raw_line, {"entities": [(label, text), ...]}).
TEST_DATA = []
with open(img_test_path, 'r') as file:
    # Strip every line, then drop the first one (skipped as a header row)
    ocr_rows = [raw.strip() for raw in file][1:]

    # Run the spaCy pipeline over each row and record its predicted entities
    for ocr_row in ocr_rows:
        found = [(ent.label_, ent.text) for ent in nlp(ocr_row).ents]
        TEST_DATA.append((ocr_row, {"entities": found}))

In [138]:
# Group the predicted entities of every OCR row by label, tagging each value
# with the document id of the row it came from.
row_labels = []
for content, obj in TEST_DATA:
    entities = obj["entities"]
    labels = {
        name: []
        for name in ("totalAmount", "totalLabel", "date", "address", "vendor")
    }

    # The document id is the second comma-separated field of the raw row
    docId = content.split(",")[1]

    for entity in entities:
        label, text = entity[0], entity[1]
        labels[label].append((docId, text))

    row_labels.append(labels)

# Entity label -> column name used in the rest of the notebook
keyMap = dict(
    totalAmount="amount",
    vendor="vendor_name",
    totalLabel="totalLabel",
    date="date",
    address="vendor_address",
)

# One flat list of (docId, value) pairs per column. The key order matters:
# later cells index the per-column DataFrames by position.
bucket = {
    column: []
    for column in ("amount", "totalLabel", "date", "vendor_address", "vendor_name")
}

for labels in row_labels:
    for label, pairs in labels.items():
        bucket[keyMap[label]].extend(pairs)

In [139]:
import re

# Keep only amounts that consist entirely of a decimal number (e.g. "9.65").
# re.fullmatch is needed because re.match only anchors the start of the
# string: a value such as "9.65RM" would otherwise pass this filter and then
# make float() raise when the amount embeddings are built. The raw string
# avoids the invalid "\d" escape sequence in a normal string literal.
bucket["amount"] = [
    pair for pair in bucket["amount"] if re.fullmatch(r"\d+[.]\d+", pair[1])
]

In [140]:
from sentence_transformers import SentenceTransformer

# Vectorize all of the fields: build one DataFrame per bucket key with the
# columns documentid, <key> and <key>_embedding.
dataframes = []
model = SentenceTransformer('bert-base-nli-mean-tokens')

for k in bucket.keys():
    ids = [v[0] for v in bucket[k]]
    values = [v[1] for v in bucket[k]]

    obj = {
        "documentid": ids,
    }
    obj[k] = values
    if k != "amount":
        # Encode once and reuse the result; encoding is the expensive step
        # and is not needed at all for the numeric amount field.
        entity_embeddings = model.encode(values)
        obj[k + "_embedding"] = list(entity_embeddings)
    else:
        # Amounts are compared numerically, so the "embedding" is just the
        # value itself wrapped in a length-1 array.
        obj[k + "_embedding"] = [np.array([float(e)]) for e in values]

    df = pd.DataFrame(obj)
    dataframes.append(df)

In [143]:
# Outer-join the per-field frames on documentid. The frames are in bucket-key
# order, so positions 0, 2, 3 and 4 are amount, date, vendor_address and
# vendor_name; position 1 (totalLabel) is left out.
merge = dataframes[0]
for field_df in (dataframes[2], dataframes[3], dataframes[4]):
    merge = pd.merge(merge, field_df, on='documentid', how='outer')
img_df = merge
img_df

Unnamed: 0,documentid,amount,amount_embedding,date,date_embedding,vendor_address,vendor_address_embedding,vendor_name,vendor_name_embedding
0,00d0243046961,9.65,[9.65],29/03/18,"[0.5429581, -0.0665234, 2.096464, 0.3384345, 0...",BOOK CO. (M) SDN BHD,"[-0.30416343, 0.39372644, 1.096122, -0.1557415...",,
1,00d0243046961,9.65,[9.65],29/03/18,"[0.5429581, -0.0665234, 2.096464, 0.3384345, 0...","NO 8, JALAN 7/118B, DESA TUN RAZAK 56000 KUALA...","[-0.08186542, 0.43657455, 0.7416691, 0.2938639...",,
2,00d0164592408,6.00,[6.0],05-03-2018,"[0.16170992, 0.006006032, 2.6065888, 0.2249980...","NO.2, JALAN TEMENGGUNG 19/9, SEKSYEN 9, BANDAR...","[0.17609178, 0.24673975, 0.31823435, 0.2454735...",,
3,00d0696518680,7.00,[7.0],03/02/2018,"[0.5998782, 0.042366665, 2.5505097, 0.5825793,...",NO 7. SIMPANG OFF BATU VILLAGE. JALAN IPOH BAT...,"[-0.008996537, 0.60043174, 0.5059461, 0.335876...",SENG THO HARDWARE TRADING,"[0.5843093, 1.0965058, 1.344565, 0.30423525, 0..."
4,00d0142736377,35.00,[35.0],13-06-2018,"[-0.008761449, 0.13843873, 2.4405327, 0.082088...","NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...","[0.061991673, 0.8515016, 0.2755021, 0.21966623...",THREE SEAFOOD RESTAURANT SDN BHD,"[0.3343588, 0.7696248, 0.338173, -0.6639876, 0..."
...,...,...,...,...,...,...,...,...,...
661,00d0531848355,,,,,"12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAM...","[0.49348155, 0.7787099, 0.49291992, -0.2070064...",SDN BHD 18 JUN 2018,"[0.9588894, 0.36526513, 2.5297034, -0.04193136..."
662,00d0147748424,,,,,"12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAM...","[0.49348173, 0.77871, 0.49292004, -0.20700623,...",INTERNATIONAL SDN BHD 11 JUN 2018,"[0.62450725, 0.40023202, 1.8833292, 0.06522257..."
663,00d0178057064,,,,,"NO.1, JALAN PERMAS 10/5, BANDAR BARU PERMAS JA...","[-0.031045718, 1.0091283, 0.015718937, -0.2923...",CAKE HOUSE SDN BHD 862725-U,"[0.105295576, 1.6263118, 0.78402436, -0.367346..."
664,00d0961798581,,,,,"LOT 2942 & 2945, JLN SERI SENTOSA 8, SERI SENT...","[0.0834254, 0.30150214, 1.0114563, 0.38561654,...",CHECKERS HYPERMARKET SDN BHD,"[0.2789244, 0.7458712, 1.4719939, 0.25180003, ..."


In [144]:
from sklearn.metrics.pairwise import cosine_similarity
from numpy import linalg

# Columns every distance table starts with: the two document ids followed by
# one distance per compared field.
dist_cols = [
    "image_doc_id",
    "user_doc_id",
    "amount_dist",
    "vendor_name_dist",
    "vendor_address_dist",
    "date_dist",
]

# Embedding columns that are compared between an image row and a user row.
cols = [
    "amount_embedding",
    "vendor_name_embedding",
    "vendor_address_embedding",
    "date_embedding",
]

def getEmbedding(x):
    """Return a copy of ``x`` with ``*_embedding`` entries added.

    ``x`` is a row-like object supporting ``.copy()`` and item access
    (presumably a pandas Series from ``iterrows`` -- confirm against callers).
    The ``date``, ``vendor_name`` and ``vendor_address`` values are encoded
    with the module-level ``model``; the amount "embedding" is the amount
    converted to a float. ``x`` itself is not modified.
    """
    embedded = x.copy()

    for field in ('date', 'vendor_name', 'vendor_address'):
        # Encode this field's value and store the result as a list of its elements
        vector = model.encode(embedded[field])
        embedded[field + "_embedding"] = list(vector)

    # The embeddings for amounts are just the amounts
    embedded["amount_embedding"] = [np.array(float(x["amount"]))]
    return embedded

def getImageDists(x):
    """Compute per-field distances between ``x`` and every row of ``img_df``.

    ``x`` is embedded with ``getEmbedding``; for each embedding column listed
    in ``cols`` the Euclidean norm of the difference to the image row's
    embedding is stored as ``<field>_dist``, alongside the raw ``<field>_img``
    and ``<field>_user`` values.

    Returns a DataFrame with one row per ``img_df`` row (in the same order,
    indexed 0..len-1). The ``dist_cols`` columns come first, followed by the
    remaining columns sorted by name.
    """
    # Get the embedding for the x value
    ref = getEmbedding(x)
    user_id = ref['documentid']

    records = []
    for _, row in img_df.iterrows():
        entry = {
            "image_doc_id": row["documentid"],
            "user_doc_id": user_id,
        }

        for embed_col in cols:
            # e.g. "vendor_name_embedding" -> "vendor_name"
            value_col = embed_col.rsplit("_", 1)[0]

            # A missing image-side embedding (NaN) propagates to a NaN distance
            dist = linalg.norm(np.array(ref[embed_col]) - np.array(row[embed_col]))

            entry[value_col + "_img"] = row[value_col]
            entry[value_col + "_user"] = ref[value_col]
            entry[value_col + "_dist"] = dist

        records.append(entry)

    # Build the frame once from the collected records: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every iteration.
    df = pd.DataFrame(records)
    extra_cols = sorted(c for c in df.columns if c not in dist_cols)
    # reindex also yields an empty frame with the dist_cols columns when
    # img_df has no rows.
    return df.reindex(columns=dist_cols + extra_cols)

In [145]:
def match(x, n=1, w=[1, 1, 1, 1]):
    """Return the ``n`` image rows with the smallest weighted distance to ``x``.

    ``w`` supplies, in order, the weights for the amount, vendor-name,
    vendor-address and date distances produced by ``getImageDists``. The
    weighted total is stored in a ``sum`` column and the rows are returned in
    ascending order of that total. The default list is only read, never
    mutated.
    """
    dists = getImageDists(x)

    amount_w, name_w, address_w, date_w = w[0], w[1], w[2], w[3]
    dists["sum"] = (
        (amount_w * dists["amount_dist"])
        + (name_w * dists["vendor_name_dist"])
        + (address_w * dists["vendor_address_dist"])
        + (date_w * dists["date_dist"])
    )

    ranked = dists.sort_values(["sum"])
    return ranked.head(n)

In [None]:
from tqdm import tqdm

test = pd.read_csv("./test_transactions.csv")

t = 0           # transactions whose top-ranked image is the same document
n = len(test)   # transactions whose document id exists in img_df
tested = 0      # transactions counted towards the running accuracy

# Passing total= lets tqdm show progress against the number of transactions
bar = tqdm(test.iterrows(), total=len(test))
for i, row in bar:
    res = match(row, 3, [1, 1, 1, 1])
    tested += 1

    # Three candidates are fetched for inspection, but only the top-ranked
    # one decides whether the transaction counts as matched.
    best_image_id = res["image_doc_id"].values[0]
    user_doc_id = res["user_doc_id"].values[0]

    if best_image_id == user_doc_id:
        print("match")
        print(res)
        t += 1
    else:
        print("no match")
        print(res)
        if user_doc_id not in img_df["documentid"].values:
            # No image row exists for this transaction, so it could never
            # match: leave it out of the adjusted totals.
            n -= 1
            print(user_doc_id, "not in imgs")
            tested -= 1

    # Guard the division explicitly instead of seeding `tested` with an
    # epsilon, which slightly skewed every reported accuracy.
    print("Running Adj Acc:", (t / tested) if tested else float("nan"))

print("Total Acc:", t, "/", len(test))
print("Adj. Acc:", t, "/", n)

1it [00:04,  4.15s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
455  00d0243521534  00d0472780579          2.5         10.893335   
456  00d0243521534  00d0472780579          2.5         10.893335   
127  00d0340785534  00d0472780579          0.3         13.178209   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
455             8.329150  11.646407       5.30          7.8  19/03/2018   
456             8.329150  11.646407       5.30          7.8  19/03/2018   
127             7.687895  12.365294       8.10          7.8    17-06-18   

    date_user                                 vendor_address_img  \
455  2018-4-4  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
456  2018-4-4  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
127  2018-4-4  LOT 1851-A & 1851-B, JALAN KP8 6, KAWASAN PERI...   

                                   vendor_address_user        vendor_name_img  \
455  NO.2, JALAN TEMENGUNG 19/9, SEKSYEM 9, BANDAR ...  RESTORA

2it [00:07,  3.88s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
562  00d0269258457  00d0269258457          0.1         10.437742   
307  00d0552375573  00d0269258457          3.9         10.437742   
316  00d0714317145  00d0269258457          5.9         10.437742   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
562             1.008541   9.610733      24.00         24.1  15/09/2017   
307             1.008541   9.827070      28.00         24.1  04/09/2017   
316             1.008541   8.934736      30.00         24.1  12/09/2017   

     date_user                                 vendor_address_img  \
562  2017-9-13  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
307  2017-9-13  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
316  2017-9-13  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   

                                   vendor_address_user  \
562  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
307  NO. 17-G, JALAN SETIA 

3it [00:11,  3.59s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
64   00d0220981260  00d0145054354          0.0         14.107836   
63   00d0220981260  00d0145054354          0.0         14.107836   
468  00d0923796657  00d0145054354          1.0         14.107836   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
64              7.992279   8.006345       7.00          7.0  29/12/2017   
63              8.671627   8.006345       7.00          7.0  29/12/2017   
468             7.992279   9.774500       6.00          7.0  30/03/2017   

      date_user                                 vendor_address_img  \
64   2017-11-12  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
63   2017-11-12                      (SETIA ALAM) SDN BHD 822737-X   
468  2017-11-12  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   

                                   vendor_address_user  \
64   NO. 31G&33G, JALAN SEITA INDAH X ,U13/X 40170 ...   
63   NO. 31G&33G, JA

4it [00:14,  3.41s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
79   00d0955195431  00d0795468800         0.71         13.311802   
149  00d0564199588  00d0795468800         0.00         15.611422   
354  00d0948775947  00d0795468800         1.20         14.826888   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
79              8.409048  12.197181       3.21          2.5  16/09/2017   
149             8.448511  11.634913       2.50          2.5  25/12/2017   
354             8.402262  12.103684       3.70          2.5  20/04/2018   

    date_user                                 vendor_address_img  \
79   2018-3-5  LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...   
149  2018-3-5  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
354  2018-3-5  16A, JALAN ASTAKA U8/83, BUKIT JELUTONG 40150 ...   

                                   vendor_address_user  \
79   69, JALANTEMENGGUNG 3/9 BANDAR MAHKOTA CHERAS ...   
149  69, JALANTEMENGGUNG 3/9

5it [00:17,  3.31s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
112  00d0245826221  00d0682248746          4.5          7.878486   
46   00d0716702145  00d0682248746          4.3          7.878486   
260  00d0744539202  00d0682248746          6.9          3.231300   

     vendor_address_dist  date_dist amount_img  amount_user  date_img  \
112             3.261534   9.110478      23.40         27.9  04-06-18   
46              3.478273   9.367435      23.60         27.9  02-05-18   
260             3.176922  12.478758      21.00         27.9  21-05-17   

     date_user                                 vendor_address_img  \
112  2018-1-28  LOP P.T. 2811, JALAN ANGSA, TAMAN BERKELEY 411...   
46   2018-1-28  LOT P.T. 2811, JALAN ANGSA, TAMAN BERKELEY 411...   
260  2018-1-28  LOT P.T. 2811, JALAN ANGSA, TAMAN BERKELEY 411...   

                                   vendor_address_user   vendor_name_img  \
112  LOT P.T. 2811, JALAN ANGSA, TAAMN BERKELEY 411...     SPEED MART S/

6it [00:20,  3.19s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
406  00d0471112949  00d0471112949        0.015          4.444289   
493  00d0223598140  00d0471112949        2.285         11.511184   
380  00d0472913563  00d0471112949        1.465         12.283155   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
406             1.734021  10.978302      25.85       25.865    02/06/18   
493             6.818001   9.393649      28.15       25.865  16/06/2018   
380             6.717823  10.003391      24.40       25.865  07/05/2018   

    date_user                                 vendor_address_img  \
406  2018-2-6  NO.89&91, JALAN UTAMA, TAMAN MUTIA RINI, 81300...   
493  2018-2-6  LOT 2110&2111 JALAN PERMAS UTARA BANDAR BARU P...   
380  2018-2-6  JALAN PERMAS UTARA 1, PERMAS JAYA 81750 MASAI ...   

                                   vendor_address_user  \
406  NO.89&91, JALAN UTAMA, TAMANMUTIA RINI, 82130 ...   
493  NO.89&91, JALAN UTAMA, TAM

7it [00:23,  3.09s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
493  00d0223598140  00d0472591457         0.15         11.945459   
419  00d0572439292  00d0472591457         0.30         10.936616   
307  00d0552375573  00d0472591457         0.30         14.057550   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
493             9.140021   7.143977      28.15         28.3  16/06/2018   
419             9.105876   8.953890      28.00         28.3    26-03-18   
307             7.562809   8.594392      28.00         28.3  04/09/2017   

     date_user                                 vendor_address_img  \
493  2018-6-17  LOT 2110&2111 JALAN PERMAS UTARA BANDAR BARU P...   
419  2018-6-17  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
307  2018-6-17  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   

                                   vendor_address_user  \
493  NO.29M, JALAN DINAR G U3/G, SEKSYEN U3, SUBANG...   
419  NO.29M, JALAN DINAR

8it [00:26,  3.10s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
482  00d0354593861  00d0354593861          0.6          5.573689   
481  00d0354593861  00d0354593861          0.6          5.573689   
214  00d0965540723  00d0354593861          0.5          8.870738   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
482             2.792207   8.725611      48.00         48.6  11/06/2018   
481             2.792207   8.725611      48.00         48.6  11/06/2018   
214            10.825954   9.986041      48.10         48.6    25-04-18   

     date_user                                 vendor_address_img  \
482  2018-11-6  NO.50, JALAN PBS 14/11, KAWASAN PERINDUSTRIAN ...   
481  2018-11-6  NO.50, JALAN PBS 14/11, KAWASAN PERINDUSTRIAN ...   
214  2018-11-6  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user  \
482  MNO.50 , JALAN PBS 14/11 , AWASAN PERINDUSTRIA...   
481  MNO.50 , JALAN PBS 14/

9it [00:29,  3.05s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
455  00d0243521534  00d0179226173          2.5          9.871409   
456  00d0243521534  00d0179226173          2.5          9.871409   
127  00d0340785534  00d0179226173          0.3         12.860557   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
455             8.410196   9.219789       5.30          7.8  19/03/2018   
456             8.410196   9.219789       5.30          7.8  19/03/2018   
127             7.758326   9.762505       8.10          7.8    17-06-18   

     date_user                                 vendor_address_img  \
455  2018-11-6  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
456  2018-11-6  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
127  2018-11-6  LOT 1851-A & 1851-B, JALAN KP8 6, KAWASAN PERI...   

                                   vendor_address_user        vendor_name_img  \
455  NO.2, JALAN TEMENGUNG 19/9,  SEKSYEN 9, BANDAR...  RES

10it [00:32,  2.99s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
365  00d0277307035  00d0277307035         0.00          9.008004   
409  00d0338271694  00d0277307035         6.72          9.008004   
255  00d0950145735  00d0277307035         7.00          9.008004   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
365              1.31885   6.062283      59.00         59.0  14-06-2018   
409              1.31885   9.055919      65.72         59.0  28-04-2018   
255              1.31885   9.050431      52.00         59.0  05-06-2018   

     date_user                                 vendor_address_img  \
365  2018-6-14  NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   
409  2018-6-14  NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   
255  2018-6-14  NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   

                                   vendor_address_user  \
365  NO. 1, TAMAN SRI DENGKIL, JALAN AIR HITAM 4380...   
409  NO. 1, TAMAN SRI DENGK

11it [00:34,  2.91s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
455  00d0243521534  00d0902256238         1.40          9.871409   
456  00d0243521534  00d0902256238         1.40          9.871409   
70   00d0924594862  00d0902256238         0.45         12.860557   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
455             8.797132  11.154868       5.30          6.7  19/03/2018   
456             8.797132  11.154868       5.30          6.7  19/03/2018   
70              8.740203  11.835114       7.15          6.7    15-07-18   

    date_user                                 vendor_address_img  \
455  2018-2-4  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
456  2018-2-4  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
70   2018-2-4  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user        vendor_name_img  \
455  NO.2, JAALN TEMENGGUNG 1/9, SEKSYEN 9, BANDARM...  RESTORA

12it [00:37,  2.87s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
360  00d0230547657  00d0230547657          0.0          9.095043   
162  00d0408017404  00d0230547657          1.0          9.095043   
229  00d0115214912  00d0230547657          0.0          9.095043   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
360             1.558674   7.483819      16.00         16.0  15/11/2017   
162             1.558674   7.849261      15.00         16.0  15/06/2017   
229             1.558674   9.197028      16.00         16.0  10/05/2017   

      date_user                                 vendor_address_img  \
360  2017-11-15  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
162  2017-11-15  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
229  2017-11-15  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   

                                   vendor_address_user  \
360  NO. 17-G, JALAN SETIA NIDAH (X) U13/X, SETIA A...   
162  NO. 17-G, JALAN SE

13it [00:40,  2.85s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
66   00d0477553790  00d0859401532          0.5         20.276999   
558  00d0763327632  00d0859401532          7.5         17.416346   
294  00d0958758313  00d0859401532          6.5         17.948719   

     vendor_address_dist  date_dist amount_img  amount_user     date_img  \
66              8.915810  10.616354     148.00        148.5   15/07/2017   
558             8.362834   8.696175     156.00        148.5   14/04/2018   
294             8.203334  11.338922     142.00        148.5  01-NOV-2017   

     date_user                                 vendor_address_img  \
66   2018-6-15  NO.9, JALAN MANIS 3, TAMAN SEGAR CHERAS, 56100...   
558  2018-6-15  NO: G3, BLK G, JLN PJU 1A/3, ARA DAMANSARA, 47...   
294  2018-6-15  LOT 204 & 205, JALAN BATU 12 1/4, KG. DUSUN NA...   

                                   vendor_address_user  \
66   NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   
558  NO.1, TAMAN SRI

14it [00:43,  2.94s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
399  00d0841330621  00d0343958098          1.9          7.973497   
113  00d0821795963  00d0343958098          4.2          8.515497   
269  00d0471481845  00d0343958098          0.0         10.114377   

     vendor_address_dist  date_dist amount_img  amount_user     date_img  \
399            10.093143   9.765615      13.50         15.4   06/02/2018   
113             7.127918  11.205272      11.20         15.4  05 MAY 2018   
269            10.956684  10.000904      15.40         15.4     19-04-18   

    date_user                                 vendor_address_img  \
399  2018-9-6  NO.17, 18 & 41, JALAN BESAR, 39100 BRINCHANG, ...   
113  2018-9-6  12, JALAN TAMPOI 7/4,KAWASAN PERINDUSTRIAN TAM...   
269  2018-9-6  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user  \
399  NO.1, JALAN PERMAS 10/5, BANDAR BARU BERMAS JA...   
113  NO.1, JALAN PERMAS 

15it [00:46,  3.06s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
444  00d0676711962  00d0676711962         3.00          3.949599   
563  00d0726523245  00d0676711962         1.25          6.133041   
184  00d0186887844  00d0676711962         1.85          6.133041   

     vendor_address_dist  date_dist amount_img  amount_user  date_img  \
444             0.848652   7.162120      27.25        24.25  30-06-18   
563             0.848652  10.071784      23.00        24.25  27-04-18   
184             0.848652  10.385761      26.10        24.25  23-03-18   

     date_user                                 vendor_address_img  \
444  2018-6-30  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
563  2018-6-30  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
184  2018-6-30  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user  \
444  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
563  LOT 1851-A & 1851-B, JALAN KPB

16it [00:49,  2.98s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
372  00d0916574976  00d0114666542         0.79         13.035134   
524  00d0116788763  00d0114666542         0.81         13.035134   
127  00d0340785534  00d0114666542         1.71         12.860557   

     vendor_address_dist  date_dist amount_img  amount_user  date_img  \
372             8.430547  11.150131      10.60         9.81  18-03-18   
524             8.430547  11.722889       9.00         9.81  11-04-18   
127             7.988399  11.442308       8.10         9.81  17-06-18   

     date_user                                 vendor_address_img  \
372  20518-1-6  LOT 1851-A & 1851-B, JALAN KPB 6 , KAWASAN PER...   
524  20518-1-6  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
127  20518-1-6  LOT 1851-A & 1851-B, JALAN KP8 6, KAWASAN PERI...   

                                   vendor_address_user       vendor_name_img  \
372  NO.2, JALANTEMENGGUNG 79/9, SEKSYEN 9, BANDAR ...    D.I.Y. (M)

17it [00:52,  2.92s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
128  00d0340785534  00d0507143553         1.50         11.258922   
410  00d0389135415  00d0507143553         1.60         11.570576   
447  00d0630325043  00d0507143553         3.65         11.104382   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
128            10.263198  10.381524      41.90         43.4    17-06-18   
410            10.421058  10.054461      45.00         43.4    12-03-18   
447            10.501796   9.301120      39.75         43.4  02/01/2018   

    date_user                                 vendor_address_img  \
128  2018-1-6  LOT 1851-A & 1851-B, JALAN KP8 6, KAWASAN PERI...   
410  2018-1-6  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
447  2018-1-6  NO 290, JALAN AIR PANAS. SETAPAK. 53200, KUALA...   

                                   vendor_address_user  \
128  NO.17,JALAN PERMAS 1/7, BANDAR BARU PERMAS JAY...   
410  NO.17,JALAN PERMAS 1/7,

18it [00:55,  3.03s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
39   00d0921867955  00d0622420051          1.2          7.389210   
53   00d0240488407  00d0622420051          1.2          7.650956   
443  00d0396141876  00d0622420051          0.4         10.605414   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
39             11.039983  10.444280      34.80         33.6    19-04-18   
53             11.039983  10.444280      34.80         33.6    19-04-18   
443             9.193432  10.713177      34.00         33.6  10/03/2018   

      date_user                                 vendor_address_img  \
39   2018-212-6  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
53   2018-212-6  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
443  2018-212-6  NO: G3, BLK G, JLN PJU 1A/3, ARA DAMANSARA, 47...   

                                   vendor_address_user  \
39   3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
53   3RD FLR, AEON T

19it [00:58,  2.97s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
298  00d0238897436  00d0831446713          0.0         12.595516   
267  00d0484942812  00d0831446713          1.0         15.752798   
536  00d0828995688  00d0831446713          2.0         14.511075   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
298             7.779684   9.333439     100.00        100.0  12/01/2018   
267             7.114766   9.770734      99.00        100.0  04/12/2017   
536             9.415433   9.532419     102.00        100.0  20/03/2018   

     date_user                                 vendor_address_img  \
298  2018-6-26  NO 18, 20, 22, JALAN BUNGA TANJONG 2/16, 40000...   
267  2018-6-26  NO 31G, JALAN SEPADU C 25/C. SECTION 25, TAMAN...   
536  2018-6-26  19, JALAN KANCIL, OFF JALAN PUDU, 55100 KUALA ...   

                                   vendor_address_user  \
298  LOT PT 1138 ,PT 33122, BANDR MAHKOTACHERAS 432...   
267  LOT PT 1138 ,PT 331

20it [01:01,  2.92s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
446  00d0472127340  00d0472127340         0.00          5.747207   
70   00d0924594862  00d0472127340         1.80          6.048920   
226  00d0760304510  00d0472127340         0.35          6.048920   

     vendor_address_dist  date_dist amount_img  amount_user  date_img  \
446             1.939196   7.320674       8.95         8.95  15-06-18   
70              1.939196   7.347870       7.15         8.95  15-07-18   
226             1.939196   9.900601       8.60         8.95  06-04-16   

     date_user                                 vendor_address_img  \
446  2018-6-15  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
70   2018-6-15  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
226  2018-6-15  LOT 1851 -A & 1851-B, JALAN KPB 6, KAWASAN PER...   

                                   vendor_address_user  \
446  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASANPSRIN...   
70   LOT 1851-A & 1851-B, JALAN KPB

21it [01:04,  2.95s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
218  00d0164203891  00d0164203891         0.00          4.958441   
400  00d0599423844  00d0164203891         2.65         13.283631   
146  00d0504725741  00d0164203891        17.65          7.429406   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
218             0.000008   8.778734     127.35       127.35  10/06/2018   
400             8.598777   9.850096     130.00       127.35  27/03/2018   
146            10.072710   8.981348     109.70       127.35  06/02/2018   

     date_user                                 vendor_address_img  \
218  2018-10-6  3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
400  2018-10-6  19, JALAN KANCIL, OFF JALAN PUDU, 55100 KUALA ...   
146  2018-10-6  NO.17,18 & 41,JALAN BESAR, 39100 BRINCHANG, CA...   

                                   vendor_address_user  \
218  3RD FLR, AEON TAMAN MALURI SC JLN JEJAKA, TAMA...   
400  3RD FLR, AEON TAMAN MA

22it [01:07,  3.03s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
365  00d0277307035  00d0343123955          1.0          5.618582   
254  00d0950145735  00d0343123955          6.0          5.618582   
255  00d0950145735  00d0343123955          6.0          5.618582   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
365              1.38156   7.919007      59.00         58.0  14-06-2018   
254              1.38156   8.099582      52.00         58.0  05-06-2018   
255              1.38156   8.099582      52.00         58.0  05-06-2018   

     date_user                                 vendor_address_img  \
365  2018-6-17  NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   
254  2018-6-17  NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   
255  2018-6-17  NO.1, TAMAN SRI DENGKIL, JALAN AIR HITAM 43800...   

                                   vendor_address_user  \
365  NO.1, TAMAN SRIDE NGKIL, JALAN AIR HITAM 43800...   
254  NO.1, TAMAN SRIDE N

23it [01:10,  3.06s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
316  00d0714317145  00d0714317145          0.0         10.314688   
307  00d0552375573  00d0714317145          2.0         10.314688   
562  00d0269258457  00d0714317145          6.0         10.314688   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
316             1.380544   8.028092      30.00         30.0  12/09/2017   
307             1.380544   8.991290      28.00         30.0  04/09/2017   
562             1.380544   9.432460      24.00         30.0  15/09/2017   

     date_user                                 vendor_address_img  \
316  2017-12-9  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
307  2017-12-9  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
562  2017-12-9  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   

                                   vendor_address_user  \
316  NO. 17-G, JALAN SEIIA INDAH (X) U13/X, SETIA A...   
307  NO. 17-G, JALAN SEIIA 

24it [01:13,  3.08s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
166  00d0192025858  00d0443302756         0.09         13.031042   
436  00d0449236994  00d0443302756         2.00         10.515183   
328  00d0810901932  00d0443302756         0.40         14.170156   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
166             8.497528  12.575989      19.31         19.4  05/08/2017   
436             9.017962  13.000319      21.40         19.4  10/08/2017   
328             6.810952  13.620086      19.80         19.4    19/01/17   

    date_user                                 vendor_address_img  \
166  2018-5-5  LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...   
436  2018-5-5  4-20, JALAN RIA 25/62 TAMAN SRI MUDA SEKSYEN 2...   
328  2018-5-5  G.23 & G.22, PLAZA SERI SETIA, NO.1 JALAN SS 9...   

                                   vendor_address_user  \
166  NO.2, JALAN TEMENGUGNG I9/, SEKSYEN 9, BANDAR ...   
436  NO.2, JALAN TEMENGUGNG 

25it [01:16,  2.99s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
238  00d0922342206  00d0484705589         0.25          8.442304   
354  00d0948775947  00d0484705589         0.25          9.521691   
474  00d0106074517  00d0484705589         1.55          8.667381   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
238             8.865184   9.034190       3.20         3.45  04/03/2018   
354            10.144777   9.773158       3.70         3.45  20/04/2018   
474            11.077158  10.073827       5.00         3.45    30-04-18   

    date_user                                 vendor_address_img  \
238  2018-3-7  NO 19-G& 19-1& 19-2 JALAN TASIK UTAMA 4, MEDAN...   
354  2018-3-7  16A, JALAN ASTAKA U8/83, BUKIT JELUTONG 40150 ...   
474  2018-3-7  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user  \
238  NO 24, 25 & 26, JALAN PUSAT PERNIAGAAN BUNGA R...   
354  NO 24, 25 & 26, JALAN P

26it [01:19,  3.02s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
354  00d0948775947  00d0415751725          1.3          7.291899   
474  00d0106074517  00d0415751725          0.0          9.569342   
510  00d0685723809  00d0415751725          4.5          8.194295   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
354            11.380821   9.269927       3.70          5.0  20/04/2018   
474             9.673655  10.412956       5.00          5.0    30-04-18   
510             8.910763   8.539619       9.50          5.0  19-04-2018   

     date_user                                 vendor_address_img  \
354  2018-6-25  16A, JALAN ASTAKA U8/83, BUKIT JELUTONG 40150 ...   
474  2018-6-25  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
510  2018-6-25     LOT-18A-2, BERJAYA TIMES SQUARE, KUALA LUMPUR.   

                                   vendor_address_user  \
354  MEENARA DION #02-03, LEVEL 2, 2, JALANSULATN I...   
474  MEENARA DION #02-03

27it [01:22,  3.00s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
455  00d0243521534  00d0726056883          1.1          9.871409   
456  00d0243521534  00d0726056883          1.1          9.871409   
238  00d0922342206  00d0726056883          1.0         12.091424   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
455             9.329500   9.999015       5.30          4.2  19/03/2018   
456             9.329500   9.999015       5.30          4.2  19/03/2018   
238             8.281724   9.503946       3.20          4.2  04/03/2018   

     date_user                                 vendor_address_img  \
455  2018-8-26  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
456  2018-8-26  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
238  2018-8-26  NO 19-G& 19-1& 19-2 JALAN TASIK UTAMA 4, MEDAN...   

                                   vendor_address_user        vendor_name_img  \
455  NO.2, JALAN TEMENGGUNG 19/9, SEKSYEN 9, BANDAR...  RES

28it [01:25,  2.94s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
540  00d0320901819  00d0916006350          4.6          9.092237   
93   00d0375035482  00d0916006350          6.4         10.542738   
448  00d0177909385  00d0916006350          2.3         15.135360   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
540             7.998228  10.140409      52.80         57.4    19/05/18   
93              7.603086   9.223762      51.00         57.4  09/04/2018   
448             8.195482   8.763869      55.10         57.4  15/01/2018   

     date_user                                 vendor_address_img  \
540  2018-6-24  JALAN TRAS BATU 1 28700 BENTONG PAHANG DARUL M...   
93   2018-6-24  NO: G3, BLK G, JLN PJU 1A/3, ARA DAMANSARA, 47...   
448  2018-6-24  NO. 53, JALAN BESAR, 45600 BATANG BERJUNTAI SE...   

                                   vendor_address_user  \
540  NO.29M, JALAN DINAR G U3/G, SEKSYEN U3, SUBANG...   
93   NO.29M, JALAN DINAR

29it [01:28,  2.90s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
79   00d0955195431  00d0660852423         1.91         13.862664   
71   00d0608126101  00d0660852423         2.60         16.617931   
354  00d0948775947  00d0660852423         2.40         15.395177   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
79              8.848753  11.208269       3.21          1.3  16/09/2017   
71              6.736193  10.508140       3.90          1.3    17/04/18   
354             9.056889   9.759012       3.70          1.3  20/04/2018   

    date_user                                 vendor_address_img  \
79   2018-6-1  LOT 3, JALAN PELABUR 23/1, 40300 SHAH ALAM, SE...   
71   2018-6-1  NO.12, JALAN PERMAS JAYA 10, BANDAR BARU PERMA...   
354  2018-6-1  16A, JALAN ASTAKA U8/83, BUKIT JELUTONG 40150 ...   

                                   vendor_address_user  \
79   NO 3, JALAN  PERMAS 10/8, BANDAR BARU PERMAS J...   
71   NO 3, JALAN  PERMAS 10/

30it [01:31,  2.88s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
184  00d0186887844  00d0811301203         1.40          8.156208   
380  00d0472913563  00d0811301203         0.30          9.177492   
398  00d0582732557  00d0811301203         2.09          7.550385   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
184             9.782003  11.317154      26.10         24.7    23-03-18   
380            10.867164  10.473670      24.40         24.7  07/05/2018   
398            10.235202  11.215338      26.79         24.7  25/09/2017   

    date_user                                 vendor_address_img  \
184  2018-9-5  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   
380  2018-9-5  JALAN PERMAS UTARA 1, PERMAS JAYA 81750 MASAI ...   
398  2018-9-5  NO. 76, JALAN SS15/4B, 47500 SUBANG JAYA, SELA...   

                                   vendor_address_user  \
184  3-7, GROUND FLOOR, JLN PURTA DENGKIL 1 TMN PUT...   
380  3-7, GROUND FLOOR, JLN 

31it [01:33,  2.84s/it]

no match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
455  00d0243521534  00d0741622526          0.3          9.871409   
456  00d0243521534  00d0741622526          0.3          9.871409   
474  00d0106074517  00d0741622526          0.0         13.035134   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
455             8.288689  12.049611       5.30          5.0  19/03/2018   
456             8.288689  12.049611       5.30          5.0  19/03/2018   
474             8.268308  13.282076       5.00          5.0    30-04-18   

    date_user                                 vendor_address_img  \
455  2018-6-5  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
456  2018-6-5  NO.2, GROUND FLOOR, JALAN REKO SENTRAL 8 TAMAN...   
474  2018-6-5  LOT 1851-A & 1851-B, JALAN KPB 6, KAWASAN PERI...   

                                   vendor_address_user        vendor_name_img  \
455  NO.2, JALAN TEMENGGUNG 19/9, SEKSGYEN 9, BANDA...  RESTORA

32it [01:36,  2.87s/it]

match
      image_doc_id    user_doc_id  amount_dist  vendor_name_dist  \
342  00d0204688228  00d0204688228          0.0          9.095043   
149  00d0564199588  00d0204688228          1.5          9.095043   
468  00d0923796657  00d0204688228          2.0          9.095043   

     vendor_address_dist  date_dist amount_img  amount_user    date_img  \
342             1.008541   7.251463       4.00          4.0  13/07/2017   
149             1.008541   9.302861       2.50          4.0  25/12/2017   
468             1.008541   9.403475       6.00          4.0  30/03/2017   

     date_user                                 vendor_address_img  \
342  2017-7-13  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
149  2017-7-13  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
468  2017-7-13  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   

                                   vendor_address_user  \
342  NO. 17-G, JALAN SETIA INDAH (X) U13/X, SETIA A...   
149  NO. 17-G, JALAN SETIA 

33it [01:39,  2.84s/it]

In [79]:
# Export one worked example: run `matchs` on the row at position 9 of `test`
# and write whatever it returns to "Model_Example.csv".
# NOTE(review): `matchs` and `test` come from earlier cells; the result is
# presumably a DataFrame of candidate matches -- confirm against `matchs`.
model_example_row = test.iloc[9]
model_example_result = matchs(model_example_row)
model_example_result.to_csv("Model_Example.csv")