In [1]:
import joblib
import traceback
import os
import pandas as pd

In [8]:
# Directory holding the serialized artifacts that load_model() reads:
# model.jbl, vectorizer.jbl, transformer.jbl and labels.txt.
# NOTE(review): absolute, machine-specific path; the notebook will not run elsewhere without editing it.
model_path = "/Users/ssingh355/code/invoice_processing/ip_classifier/models/2021-07-22 19:25:32.737538"

## Testing the model trained with the new class "Teleperformance USA" on the test set

In [9]:
def return_categoy(predictions, labels):
    """
    Translate predicted class indices into their label names.

    :param predictions: indexable collection of class indices (must support len() and integer indexing)
    :param labels: sequence in which position i holds the label of class index i
    :return: list with one label per prediction, in the same order
    """
    return [labels[predictions[pos]] for pos in range(len(predictions))]

In [10]:
def load_model(model_path):
    """
    Load the classifier artifacts stored in a model directory.

    Reads ``model.jbl``, ``vectorizer.jbl`` and ``transformer.jbl`` with
    ``joblib.load`` and the class names from ``labels.txt`` (one label per line).

    :param model_path: directory containing the four artifact files (str or path-like)
    :return: dict with the keys "model", "vectorizer", "transformer" and "labels"
    """
    # os.path.join copes with a trailing separator and with path-like objects,
    # which plain string concatenation does not.
    model_file_path = os.path.join(model_path, 'model.jbl')
    vectorizer_path = os.path.join(model_path, 'vectorizer.jbl')
    transformer_path = os.path.join(model_path, 'transformer.jbl')
    label_file = os.path.join(model_path, 'labels.txt')

    # NOTE(review): joblib.load unpickles arbitrary objects; only use it on trusted model directories.
    model = joblib.load(model_file_path)
    vectorizer = joblib.load(vectorizer_path)
    transformer = joblib.load(transformer_path)
    print("Models loaded")

    # Explicit encoding so label names are decoded the same way on every platform.
    with open(label_file, 'r', encoding='utf-8') as labelfile:
        labels = labelfile.read().splitlines()

    return {
        "model": model,
        "vectorizer": vectorizer,
        "transformer": transformer,
        "labels": labels
    }

In [11]:
def classify_text(text, model_dict):
    """
    Classify the text of a single document with a loaded model bundle.

    :param text: raw text of one document
    :param model_dict: mapping with the keys "vectorizer", "transformer", "model" and "labels"
    :return: on success ``[200, {"predicted_doctype": <label as str>, "conf_score": <highest predict_proba value>}]``;
             on failure a one-element list holding a string made of the failing stage name
             (MODEL_FETCH_EXCEPTION, VECTORIZING_EXCEPTION, PREDICTION_EXCEPTION or
             LABEL_EXTRACTION_EXCEPTION), a newline and the formatted traceback.
    """
    # The vectorizer is called with a collection of documents, so wrap the single text.
    text = [text]

    # Each stage catches Exception (not a bare except) so that KeyboardInterrupt
    # and SystemExit still propagate instead of being reported as a stage failure.
    try:
        vectorizer = model_dict["vectorizer"]
        transformer = model_dict["transformer"]
        model = model_dict["model"]
        labels = model_dict["labels"]
    except Exception:
        tb = traceback.format_exc()
        return ["MODEL_FETCH_EXCEPTION" + "\n" + tb]

    try:
        # Convert to bag of words
        X = vectorizer.transform(text)
        # Convert from occurrences to frequencies
        X = transformer.transform(X)
    except Exception:
        tb = traceback.format_exc()
        return ["VECTORIZING_EXCEPTION" + "\n" + tb]

    try:
        predictions = model.predict(X)
        # Confidence is the largest class probability reported for the input.
        conf_score = model.predict_proba(X).max()
    except Exception:
        tb = traceback.format_exc()
        return ["PREDICTION_EXCEPTION" + "\n" + tb]

    try:
        result = return_categoy(predictions, labels)
    except Exception:
        tb = traceback.format_exc()
        return ["LABEL_EXTRACTION_EXCEPTION" + "\n" + tb]

    success_msg = {"predicted_doctype": str(result[0]), "conf_score": conf_score}

    return [200, success_msg]

In [12]:
model_dict = load_model(model_path)

Models loaded


In [13]:
# Root folder of the dataset: one sub-folder per document class; the test
# samples of a class are read from its 'test' sub-directory further below.
# NOTE(review): absolute, machine-specific path.
DATASET_INP_PATH = "/Users/ssingh355/code/invoice_processing/ip_classifier/imgs_filtered"

In [14]:
dirs = os.listdir(DATASET_INP_PATH)

In [15]:
dirs

['Dell Financial Services-V1',
 'FLEXI PERSONNEL LTD-English-V1',
 '.DS_Store',
 '1011656 - FACEBOOK NETHERLANDS BV-English-V1',
 'Foodee Media US Inc-V1',
 'Taxback International-V2',
 'CANON INDIA PVT LTD-V1',
 'TORPEDO MARKETING INC-V1',
 'CDW Canada Corp-V1',
 'The Siegfried Group LLC-V2',
 'Facebook Inc-V1',
 '911 Mobile Mechanic-V2',
 'Manning Gottlieb OMD-English-V2',
 'PROFESSIONAL COMMERCIALS-V1',
 'Randstad RiseSmart-V1',
 'WIPRO LIMITED-V1',
 'MoreDirect Inc dba Connection-V1',
 'Other-Document',
 'CONCENTRIX CORPORATION US-V1',
 'BHARTI AIRTEL LIMITED-V1',
 'TELEPERFORMANCE COLOMBIA S.A.S.-V1',
 'FOXBOX RETAIL PRIVATE LIMITED-V1',
 'Tata Communications Limited-V1',
 'Taxback International-V1',
 'ARKPHIRE IRELAND LIMITED-English-V1',
 'Teleperformance USA',
 'A. G. Adjustments-V1',
 'Hewlett Packard Financial Services Company-V1',
 'DE LAGE LANDEN FINANCIAL SERVICES-V1',
 'Google LLC',
 '911 Mobile Mechanic-V1',
 'CONQUER TECHNOLOGIES-V1',
 'WIPRO LIMITED-V2',
 'Snappy App I

In [16]:
len(dirs)

34

In [17]:
# Collect one [file path, raw text, class name] record per test document.
dataset = []
for class_folder in dirs:
    if class_folder == ".DS_Store":
        # macOS Finder metadata file, not a class folder
        continue
    class_folder_path = os.path.join(DATASET_INP_PATH, class_folder, 'test')
    for file in os.listdir(class_folder_path):
        if file == ".DS_Store":
            continue
        fp = os.path.join(class_folder_path, file)
        with open(fp, 'r', encoding='utf-8') as f:
            raw_text = f.read()
        dataset.append([fp, raw_text, class_folder])

In [18]:
len(dataset)

668

In [19]:
df = pd.DataFrame(dataset)
df

Unnamed: 0,0,1,2
0,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1
1,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1
2,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1
3,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1
4,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1
...,...,...,...
663,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1
664,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1
665,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1
666,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1


In [20]:
# Import label encoder
from sklearn import preprocessing

In [21]:
# LabelEncoder maps each distinct class-name string to an integer id.
label_encoder = preprocessing.LabelEncoder()
  
# Encode the class names in column 2 (the class-folder name of each sample)
# into a new integer 'labels' column.
df['labels']= label_encoder.fit_transform(df[2])
  
# Show the distinct encoded ids.
df['labels'].unique()

array([11, 12,  0, 15, 28,  6, 25,  7, 30, 14,  2, 18, 21, 22, 31, 19, 20,
        8,  5, 24, 13, 26, 27,  4, 29,  3, 17, 10, 16,  1,  9, 32, 23])

In [22]:
df

Unnamed: 0,0,1,2,labels
0,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11
1,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11
2,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11
3,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11
4,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11
...,...,...,...,...
663,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23
664,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23
665,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23
666,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23


In [23]:
df.loc[df['labels'] == 13]

Unnamed: 0,0,1,2,labels
496,/Users/ssingh355/code/invoice_processing/ip_cl...,Tax Invoice\nFOXBOX RETAIL PRIVATE LIMITED\nIn...,FOXBOX RETAIL PRIVATE LIMITED-V1,13
497,/Users/ssingh355/code/invoice_processing/ip_cl...,Tax Invoice\nFOXBOX RETAIL PRIVATE LIMITED\nIn...,FOXBOX RETAIL PRIVATE LIMITED-V1,13
498,/Users/ssingh355/code/invoice_processing/ip_cl...,Tax Invoice\nFOXBOX RETAIL PRIVATE LIMITED\nIn...,FOXBOX RETAIL PRIVATE LIMITED-V1,13
499,/Users/ssingh355/code/invoice_processing/ip_cl...,Tax Invoice\nFOXBOX RETAIL PRIVATE LIMITED\nIn...,FOXBOX RETAIL PRIVATE LIMITED-V1,13
500,/Users/ssingh355/code/invoice_processing/ip_cl...,Tax Invoice\nFOXBOX RETAIL PRIVATE LIMITED\nIn...,FOXBOX RETAIL PRIVATE LIMITED-V1,13


In [24]:
# Classify every sample's raw text (column 1) and keep one entry per row:
# the success payload dict when the status is 200, otherwise the whole error list.
predicted_results = []
for _, sample in df.iterrows():
    outcome = classify_text(sample[1], model_dict)
    predicted_results.append(outcome[1] if outcome[0] == 200 else outcome)

In [25]:
predicted_results

[{'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999058246784568},
 {'predict

In [19]:
predicted_results

[{'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predicted_doctype': 'Dell Financial Services-V1',
  'conf_score': 0.9999448542884061},
 {'predict

In [26]:
df["predicted_results"] = predicted_results

In [27]:
df["predicted_label"] = [i['predicted_doctype'] for i in predicted_results]

In [28]:
df

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label
0,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1
1,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1
2,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1
3,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1
4,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1
...,...,...,...,...,...,...
663,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1
664,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1
665,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1
666,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1


In [40]:
df.loc[df[2]=='Teleperformance USA']

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label,predicted_confidence
533,/Users/ssingh355/code/invoice_processing/ip_cl...,Please Remit Payment To:\nBy Wire\n' Teleperfo...,Teleperformance USA,29,"{'predicted_doctype': 'Teleperformance USA', '...",Teleperformance USA,0.923785
534,/Users/ssingh355/code/invoice_processing/ip_cl...,Please Remit Payment To:\nBy Wire\n' Teleperfo...,Teleperformance USA,29,"{'predicted_doctype': 'Teleperformance USA', '...",Teleperformance USA,0.903637
535,/Users/ssingh355/code/invoice_processing/ip_cl...,Please Remit Payment To:\nBy Wire\n' Teleperfo...,Teleperformance USA,29,"{'predicted_doctype': 'Teleperformance USA', '...",Teleperformance USA,0.923484
536,/Users/ssingh355/code/invoice_processing/ip_cl...,Please Remit Payment To:\nBy Wire\n' Teleperfo...,Teleperformance USA,29,"{'predicted_doctype': 'Teleperformance USA', '...",Teleperformance USA,0.923785
537,/Users/ssingh355/code/invoice_processing/ip_cl...,Please Remit Payment To:\nBy Wire\n' Teleperfo...,Teleperformance USA,29,"{'predicted_doctype': 'Teleperformance USA', '...",Teleperformance USA,0.923484


In [44]:
df.loc[df[2]=='Taxback International-V1']

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label,predicted_confidence
508,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.603816
509,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.535016
510,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.568919
511,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.599889
512,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.591326
513,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.659853
514,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.600462
515,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.571778
516,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.591728
517,/Users/ssingh355/code/invoice_processing/ip_cl...,Taxback\nInternational\nUber B.V.\nMeester Tre...,Taxback International-V1,27,{'predicted_doctype': 'Taxback International-V...,Taxback International-V1,0.572365


In [29]:
df["predicted_confidence"] = [i['conf_score'] for i in predicted_results]

In [30]:
# Count misclassified test samples and print each one as:
#   <true class> <predicted class> <confidence>
wrong_predictions = 0
for index, row in df.iterrows():
    # Read the values from `row` itself rather than re-indexing df: a positional
    # (.iloc) and a label (.loc) lookup with the same `index` only agree while
    # the frame keeps its default RangeIndex.
    true_label = row.iloc[2]
    if true_label != row['predicted_label']:
        wrong_predictions += 1
        print(true_label, row['predicted_label'], row['predicted_confidence'])

Other-Document Dell Financial Services-V1 0.21776043884043386
Other-Document Google LLC 0.2500700421673589
Other-Document CDW Canada Corp-V1 0.13556367960127289
Other-Document The Siegfried Group LLC-V2 0.09165156554007696
Other-Document CDW Canada Corp-V1 0.22352230489575767
Other-Document Google LLC 0.2526149341513185
911 Mobile Mechanic-V1 911 Mobile Mechanic-V2 0.6957958081259243


In [84]:
# Print the correctly classified samples of one specific class as:
#   <true class> <predicted class> <confidence>
# NOTE(review): the directory listing shown earlier contains
# 'FLEXI PERSONNEL LTD-English-V1' but no '-V2' entry, so this filter probably
# matches nothing; confirm the intended class name.
for index, row in df.iterrows():
    # Values come from `row` so the check does not depend on index labels
    # coinciding with row positions.
    true_label = row.iloc[2]
    if true_label == row['predicted_label'] and true_label == 'FLEXI PERSONNEL LTD-English-V2':
        print(true_label, row['predicted_label'], row['predicted_confidence'])

In [31]:
# Accuracy = share of test samples whose predicted label matched the true class.
accuracy = 1 - wrong_predictions / len(df['predicted_label'])
print("Accuracy = {}".format(accuracy))

Accuracy = 0.9895209580838323


In [32]:
df

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label,predicted_confidence
0,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1,0.999906
1,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1,0.999906
2,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1,0.999906
3,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1,0.999906
4,/Users/ssingh355/code/invoice_processing/ip_cl...,Dell\nFinancial\nServices\nwww.DellFinancialSe...,Dell Financial Services-V1,11,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1,0.999906
...,...,...,...,...,...,...,...
663,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1,0.987002
664,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1,0.979889
665,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1,0.970303
666,/Users/ssingh355/code/invoice_processing/ip_cl...,"Snappy App, Inc.\nInvoice\n125 5th Ave, Floor ...",Snappy App Inc-V1,23,"{'predicted_doctype': 'Snappy App Inc-V1', 'co...",Snappy App Inc-V1,0.973358


In [34]:
df_low_confidence = df.loc[df['predicted_confidence'] <= 0.65]

In [35]:
len(df_low_confidence)

26

In [36]:
df_low_confidence

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label,predicted_confidence
29,/Users/ssingh355/code/invoice_processing/ip_cl...,Facebook Netherlands B.V.\nINVOICE\nParkhuis A...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0,{'predicted_doctype': '1011656 - FACEBOOK NETH...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0.555134
30,/Users/ssingh355/code/invoice_processing/ip_cl...,Facebook Netherlands B.V.\nINVOICE\nParkhuis A...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0,{'predicted_doctype': '1011656 - FACEBOOK NETH...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0.0989
31,/Users/ssingh355/code/invoice_processing/ip_cl...,Facebook Netherlands B.V.\nINVOICE\nParkhuis A...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0,{'predicted_doctype': '1011656 - FACEBOOK NETH...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0.392293
32,/Users/ssingh355/code/invoice_processing/ip_cl...,Facebook Netherlands B.V.\nParkhuis Amsterdam\...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0,{'predicted_doctype': '1011656 - FACEBOOK NETH...,1011656 - FACEBOOK NETHERLANDS BV-English-V1,0.536009
132,/Users/ssingh355/code/invoice_processing/ip_cl...,"Facebook, Inc.\nINVOICE\n1601 Willow Rd\nFACEB...",Facebook Inc-V1,14,"{'predicted_doctype': 'Facebook Inc-V1', 'conf...",Facebook Inc-V1,0.641507
457,/Users/ssingh355/code/invoice_processing/ip_cl...,jiffy lube\nMY FLEET CENTER\nSTATEMENT SUMMARY...,Other-Document,20,{'predicted_doctype': 'Dell Financial Services...,Dell Financial Services-V1,0.21776
464,/Users/ssingh355/code/invoice_processing/ip_cl...,3/30/2020\nhttps://uber.coupahost.com/order_he...,Other-Document,20,"{'predicted_doctype': 'Google LLC', 'conf_scor...",Google LLC,0.25007
466,/Users/ssingh355/code/invoice_processing/ip_cl...,Work Order Create Date Worker ID\nWork Order I...,Other-Document,20,"{'predicted_doctype': 'CDW Canada Corp-V1', 'c...",CDW Canada Corp-V1,0.135564
467,/Users/ssingh355/code/invoice_processing/ip_cl...,Medmark Ltd\n69 Lower Baggot Street\nDublin 2\...,Other-Document,20,{'predicted_doctype': 'The Siegfried Group LLC...,The Siegfried Group LLC-V2,0.091652
468,/Users/ssingh355/code/invoice_processing/ip_cl...,"C REDIT SERVC ES\nCDW Limited, 1 New Change, L...",Other-Document,20,"{'predicted_doctype': 'CDW Canada Corp-V1', 'c...",CDW Canada Corp-V1,0.223522


In [39]:
df_low_confidence[2].value_counts()

Taxback International-V1                        13
Other-Document                                   6
1011656 - FACEBOOK NETHERLANDS BV-English-V1     4
WIPRO LIMITED-V2                                 1
Facebook Inc-V1                                  1
FOXBOX RETAIL PRIVATE LIMITED-V1                 1
Name: 2, dtype: int64