In [7]:
## Training ML models to predict the label
## Using SVM, Random Forest, Decision Tree, Logistic Regression
import pandas as pd
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Get training and testing set
dataset = pd.read_csv('./labeled_candidate_set.csv')

## Choose some attribute to train the model
# Identifier columns are kept in the frame for traceability but are never
# passed to a model; only the similarity scores are used as features.
ID_COLUMNS = ['ltable_invoice_item_id', 'rtable_purchase_order_item_id']
FEATURE_COLUMNS = [
    'name_similarity',
    'unit_similarity',
    'unit_price_similarity',
    'quantity_similarity',
    'total_price_before_vat_similarity',
    'vat_amount_similarity',
]
LABEL_COLUMN = 'label'
RANDOM_SEED = 2

dataset = dataset[ID_COLUMNS + FEATURE_COLUMNS + [LABEL_COLUMN]]

# 80/20 split: sample the training rows with a fixed seed, then use every
# remaining row (by index) as the hold-out test set.
train = dataset.sample(frac=0.8, random_state=RANDOM_SEED)
test = dataset.drop(train.index)

X_train, y_train = train[FEATURE_COLUMNS], train[LABEL_COLUMN]
X_test, y_test = test[FEATURE_COLUMNS], test[LABEL_COLUMN]

## Create the model
# The forest and the tree are randomized estimators; seeding them makes the
# reported metrics (and the models saved later) reproducible across re-runs.
SVM = svm.SVC(kernel='rbf')
RF = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
DT = DecisionTreeClassifier(random_state=RANDOM_SEED)
LR = LogisticRegression()

## Train the model
for model in (SVM, RF, DT, LR):
    model.fit(X_train, y_train)

## Test the model
SVM_pred = SVM.predict(X_test)
RF_pred = RF.predict(X_test)
DT_pred = DT.predict(X_test)
LR_pred = LR.predict(X_test)

# Report the same four metrics for every model, in a fixed order.
METRICS = [
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
]
for model_name, predictions in [('SVM', SVM_pred), ('RF', RF_pred), ('DT', DT_pred), ('LR', LR_pred)]:
    for metric_name, metric in METRICS:
        print(model_name + ' ' + metric_name + ':', metric(y_test, predictions))

SVM accuracy: 0.9921594982078853
SVM precision: 0.9743589743589743
SVM recall: 0.9457013574660633
SVM f1: 0.9598163030998853
RF accuracy: 0.9919354838709677
RF precision: 0.9634703196347032
RF recall: 0.9547511312217195
RF f1: 0.959090909090909
DT accuracy: 0.9903673835125448
DT precision: 0.9586206896551724
DT recall: 0.9434389140271493
DT f1: 0.9509692132269099
LR accuracy: 0.9908154121863799
LR precision: 0.971764705882353
LR recall: 0.9343891402714932
LR f1: 0.9527104959630911


In [8]:
## Save the model using pickle
import pickle

# Serialize each fitted classifier to its own pickle file in the working
# directory, in the order SVM, RF, DT, LR.
for model_path, fitted_model in (
    ('SVM_model.pkl', SVM),
    ('RF_model.pkl', RF),
    ('DT_model.pkl', DT),
    ('LR_model.pkl', LR),
):
    # The context manager closes the file once the dump has finished.
    with open(model_path, 'wb') as f:
        pickle.dump(fitted_model, f)

In [9]:
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Ground-truth labels for the evaluation are read from this file.
dataset = pd.read_csv('./validate_candidate_set.csv')

## Load the model using pickle to predict the label
def load_pickled_model(path):
    """Load one pickled model from `path`, closing the file afterwards.

    NOTE: unpickling executes arbitrary code embedded in the file, so only
    load model files produced by a trusted source.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


svm_model = load_pickled_model('SVM_model.pkl')
rf_model = load_pickled_model('RF_model.pkl')
dt_model = load_pickled_model('DT_model.pkl')
lr_model = load_pickled_model('LR_model.pkl')

## Predict the label
testing = pd.read_csv('./candidate_set.csv')

print(len(testing), len(dataset))

# Features come from `testing` and labels from `dataset`; they are paired
# purely by row position, so fail early (before the expensive predictions)
# if the two files do not even have the same number of rows.
# NOTE(review): this assumes both CSVs list the same pairs in the same
# order -- confirm against how validate_candidate_set.csv was produced.
if len(testing) != len(dataset):
    raise ValueError(
        'candidate_set.csv and validate_candidate_set.csv have different row counts: '
        + str(len(testing)) + ' vs ' + str(len(dataset))
    )

ID_COLUMNS = ['ltable_invoice_item_id', 'rtable_purchase_order_item_id']
FEATURE_COLUMNS = [
    'name_similarity',
    'unit_similarity',
    'unit_price_similarity',
    'quantity_similarity',
    'total_price_before_vat_similarity',
    'vat_amount_similarity',
]

testing = testing[ID_COLUMNS + FEATURE_COLUMNS]
features = testing[FEATURE_COLUMNS]

svm_predict = svm_model.predict(features)
rf_predict = rf_model.predict(features)
dt_predict = dt_model.predict(features)
lr_predict = lr_model.predict(features)

## Evaluate the model
# Same four metrics for every model, printed in a fixed order.
METRICS = [
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
]
for model_name, predictions in [('SVM', svm_predict), ('RF', rf_predict), ('DT', dt_predict), ('LR', lr_predict)]:
    for metric_name, metric in METRICS:
        print(model_name + ' ' + metric_name + ':', metric(dataset['label'], predictions))

3169438 3169438
SVM accuracy: 0.9965110533791796
SVM precision: 0.16312977099236642
SVM recall: 0.9574372759856631
SVM f1: 0.27876337072788937
RF accuracy: 0.9959589050172302
RF precision: 0.14657131399545514
RF recall: 0.9825268817204301
RF f1: 0.25508898452948703
DT accuracy: 0.9953707881334167
DT precision: 0.13002617178206044
DT recall: 0.9793906810035843
DT f1: 0.22957361898760767
LR accuracy: 0.996800379120841
LR precision: 0.17261362695587382
LR recall: 0.9341397849462365
LR f1: 0.29138424987771644
