In [3]:
import joblib
import traceback
import os
import pandas as pd

In [4]:
model_path = "/Users/jghosh2/Documents/my-notebook/BOL_CODE_REF/01_Pepsi_bol/01_Classifier/06_models/2021_09_03_11_14_19"

In [7]:
def return_categoy(predictions, labels):
    testing_predictions = []
    for i in range(len(predictions)):
        labelidx = predictions[i]
        predicted_label = labels[labelidx]
        testing_predictions.append(predicted_label)
    return testing_predictions

In [13]:
def load_model(model_path):
    model_file_path = model_path + '/' + 'model.jbl'
    vectorizer_path = model_path + '/' + 'vectorizer.jbl'
    transformer_path = model_path + '/' + 'transformer.jbl'
    label_file = model_path + '/' + 'labels.txt'
    ## Load model here
    model = joblib.load(model_file_path)
    print(model)
    vectorizer = joblib.load(vectorizer_path)
    transformer = joblib.load(transformer_path)
    print("Models loaded")
    with open(label_file, 'r') as labelfile:
        labels = labelfile.read().splitlines()
    text_model = {
        "model": model,
        "vectorizer": vectorizer,
        "transformer": transformer,
        "labels": labels
    }
    print(text_model)
#         loaded_models[model_name] = text_model
    return text_model

In [9]:
def classify_text(text, model_dict):
    """

    :param text:
    :return:
    """

#     text_bytes = base64.b64decode(text_base64)
#     text = text_bytes.decode('ascii')
    text = [text]

    try:
        vectorizer = model_dict["vectorizer"]
        transformer = model_dict["transformer"]
        model = model_dict["model"]
        labels = model_dict["labels"]
    except:
        tb = traceback.format_exc()
        return ["MODEL_FETCH_EXCEPTION" + "\n" + tb]

    try:
        # Convert to bag of words
        X = vectorizer.transform(text)
        # Convert from occurrences to frequencies
        X = transformer.transform(X)
    except:
        tb = traceback.format_exc()
        return ["VECTORIZING_EXCEPTION" + "\n" + tb]

    try:
        predictions = model.predict(X)
        conf_score = model.predict_proba(X).max()
    except:
        tb = traceback.format_exc()
        return ["PREDICTION_EXCEPTION" + "\n" + tb]

    try:
        result = return_categoy(predictions, labels)
    except:
        tb = traceback.format_exc()
        return ["LABEL_EXTRACTION_EXCEPTION" + "\n" + tb]

    success_msg = {"predicted_doctype": str(result[0]), "conf_score": conf_score}

    return [200, success_msg]

In [14]:
model_dict = load_model(model_path)

MultinomialNB()
Models loaded
{'model': MultinomialNB(), 'vectorizer': CountVectorizer(stop_words='english', strip_accents='ascii',
                token_pattern='(?ui)\\b\\w*[a-z]+\\w*\\b'), 'transformer': TfidfTransformer(), 'labels': ['Format1', 'Format2', 'Others', 'Packing List']}


In [7]:
DATASET_INP_PATH = "/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_Classifier/04_Data"

In [8]:
dirs = os.listdir(DATASET_INP_PATH)
dirs

['Format1', 'Format2', 'Others', 'Packing List']

In [15]:
dataset = []
for class_folder in dirs:
    if class_folder != ".DS_Store":
#         label_list.append(class_folder)
        class_folder_path = os.path.join(DATASET_INP_PATH, class_folder, 'test')
        class_files = os.listdir(class_folder_path)
        for file in class_files:
            if file != ".DS_Store":
                fp = os.path.join(class_folder_path, file)
                with open(fp, 'r', encoding='utf-8') as f:
                    raw_text = f.read()
                data = [fp, raw_text, class_folder]
                dataset.append(data)

NameError: name 'dirs' is not defined

In [10]:
len(dataset)

67

In [11]:
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,0,1,2
0,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nOrder...,Format1
1,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beveragee dompany\nBILL OF LADING\nBOL #...,Format1
2,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,BILL OF LADING\nPepsi Beverages Company\nOrder...,Format1
3,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nÊD000...,Format1
4,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nBOL #...,Format1


In [12]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['labels']= label_encoder.fit_transform(df[2])

In [13]:
df['labels'].unique()

array([0, 1, 2, 3])

In [14]:
df.head()

Unnamed: 0,0,1,2,labels
0,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nOrder...,Format1,0
1,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beveragee dompany\nBILL OF LADING\nBOL #...,Format1,0
2,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,BILL OF LADING\nPepsi Beverages Company\nOrder...,Format1,0
3,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nÊD000...,Format1,0
4,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nBOL #...,Format1,0


In [15]:
predicted_results = []
for index, row in df.iterrows():
    raw_text = row[1]
    result = classify_text(raw_text, model_dict)
    if result[0] != 200:
        predicted_result = result
    else:
        predicted_result = result[1]
    predicted_results.append(predicted_result)

In [16]:
predicted_results

[{'predicted_doctype': 'Format1', 'conf_score': 0.9879960553600494},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9936402433149718},
 {'predicted_doctype': 'Format1', 'conf_score': 0.97075364045218},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9447205047770952},
 {'predicted_doctype': 'Format1', 'conf_score': 0.986972387225729},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9913250462117423},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9831001381993777},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9925883848126469},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9925145895013731},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9301867771733758},
 {'predicted_doctype': 'Format1', 'conf_score': 0.897946906744762},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9926502673644031},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9560432155420077},
 {'predicted_doctype': 'Format1', 'conf_score': 0.9982109556113351},
 {'predicted_doctype': 'Format1', 'con

In [17]:
df["predicted_results"] = predicted_results

In [18]:
df["predicted_label"] = [i['predicted_doctype'] for i in predicted_results]

In [19]:
df

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label
0,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nOrder...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1
1,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beveragee dompany\nBILL OF LADING\nBOL #...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1
2,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,BILL OF LADING\nPepsi Beverages Company\nOrder...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1
3,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nÊD000...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1
4,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nBOL #...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1
...,...,...,...,...,...,...
62,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Packing List\nWarehouse ID\nOrder Number: 8253...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List
63,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,2.\nPacking List\nItem Number\nGross Weight\nQ...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List
64,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Packing List\nShip From:\nOrder Number: 825425...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List
65,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Packing List\nShip From:\nOrder Number: 825189...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List


In [20]:
df["predicted_confidence"] = [i['conf_score'] for i in predicted_results]

In [21]:
wrong_predictions = 0
for index, row in df.iterrows():
    if df.iloc[index,2] != df.loc[index,'predicted_label']:
        wrong_predictions+=1
#         print(os.path.basename(df.iloc[index,0]))
        print(df.iloc[index,2], df.loc[index,'predicted_label'], df.loc[index,'predicted_confidence'])

Others Format2 0.4841713947361479
Others Format1 0.2916666666666667
Packing List Format2 0.32598908454822034
Packing List Format1 0.4043761213525272


In [22]:
wrong_predictions = 0
for index, row in df.iterrows():
    if df.iloc[index,2] != df.loc[index,'predicted_label']:
        wrong_predictions+=1
#         print(os.path.basename(df.iloc[index,0]))
        print(df.iloc[index,0])

/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_Classifier/04_Data\Others\test\1700184814-PROOF_OF_DELIVERY (1).JPG_raw_text.txt
/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_Classifier/04_Data\Others\test\desktop.ini
/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_Classifier/04_Data\Packing List\test\1_PEPSI2_PROOF_OF_DELIVERY.PDF.jpg_raw_text.txt
/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_Classifier/04_Data\Packing List\test\2_combinepdf (45).pdf.jpg_raw_text.txt


In [23]:
print("Accuracy = {}".format(1 - wrong_predictions/len(df['predicted_label'])))

Accuracy = 0.9402985074626866


In [24]:
print("AVG_Conf_score = {}".format(sum(df['predicted_confidence'])/len(df['predicted_confidence'])))

AVG_Conf_score = 0.8392395236549749


In [25]:
df

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label,predicted_confidence
0,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nOrder...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1,0.987996
1,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beveragee dompany\nBILL OF LADING\nBOL #...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1,0.993640
2,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,BILL OF LADING\nPepsi Beverages Company\nOrder...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1,0.970754
3,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nÊD000...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1,0.944721
4,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Pepsi Beverages Company\nBILL OF LADING\nBOL #...,Format1,0,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1,0.986972
...,...,...,...,...,...,...,...
62,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Packing List\nWarehouse ID\nOrder Number: 8253...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List,0.890655
63,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,2.\nPacking List\nItem Number\nGross Weight\nQ...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List,0.814265
64,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Packing List\nShip From:\nOrder Number: 825425...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List,0.719086
65,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Packing List\nShip From:\nOrder Number: 825189...,Packing List,3,"{'predicted_doctype': 'Packing List', 'conf_sc...",Packing List,0.793883


In [26]:
df_low_confidence = df.loc[df['predicted_confidence'] <= 0.65]

In [27]:
len(df_low_confidence)

14

In [28]:
df_low_confidence

Unnamed: 0,0,1,2,labels,predicted_results,predicted_label,predicted_confidence
39,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,of sald route jo deştinatio. Straight Bijach p...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.559047
41,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Capstone Lqgistics\n30 TECHNOOLOGY PKWY SOUTH ...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.614476
42,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,COLD SPRING BREWING COMPANY TRAILER INSPECTION...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.337297
43,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,Arduni alnlgmeN ihai al laadins has een ued an...,Others,2,"{'predicted_doctype': 'Format2', 'conf_score':...",Format2,0.484171
45,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,MASTER BILL OF LADING\nSHOW THI! BILL OF LADIN...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.522955
50,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,MASTER BILL OF LADING\nSHOW THI! BILL OF LADIN...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.641806
51,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,11/3/2020\nScan2020-11-03_140153 091.jpg\nubeR...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.550998
54,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,\n,Others,2,"{'predicted_doctype': 'Format1', 'conf_score':...",Format1,0.291667
55,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,S68\nEACELLENC\nDELIVERY NOTE - Item\nPage:\nc...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.639734
56,/Users/hsingh151/My_Work/BOL/01_Pepsi_bol/01_C...,g00 Lot\n49939\nTrip Delivery Sheet\nTrip Info...,Others,2,"{'predicted_doctype': 'Others', 'conf_score': ...",Others,0.547014


In [29]:
df_low_confidence[2].value_counts()

Others          10
Packing List     4
Name: 2, dtype: int64

In [30]:
df_format1 = df.loc[df['labels'] == 0]

In [31]:
print("AVG_Conf_score format1 = {}".format(sum(df_format1['predicted_confidence'])/len(df_format1['predicted_confidence'])))

AVG_Conf_score format1 = 0.9747678771307307


In [32]:
df_format2 = df.loc[df['labels'] == 1]

In [33]:
print("AVG_Conf_score format2 = {}".format(sum(df_format2['predicted_confidence'])/len(df_format2['predicted_confidence'])))

AVG_Conf_score format2 = 0.9857647865495446


In [34]:
df_others = df.loc[df['labels'] == 2]

In [35]:
print("AVG_Conf_score Other = {}".format(sum(df_others['predicted_confidence'])/len(df_others['predicted_confidence'])))

AVG_Conf_score Other = 0.6392260371124331


In [36]:
df_packinglist = df.loc[df['labels'] == 3]

In [37]:
print("AVG_Conf_score Packing List = {}".format(sum(df_packinglist['predicted_confidence'])/len(df_packinglist['predicted_confidence'])))

AVG_Conf_score Packing List = 0.648709402038475


In [39]:
#df_packinglist