### Option 1: Direct Prediction from xlsx file

In [3]:
from src.predictor import Predictor

In [5]:
# Option 1 runs everything through a single Predictor call: the method is
# given the two workbooks to read and the workbook/sheet to write, and its
# return value is kept in `output_df` for the evaluation cell that follows.
p = Predictor()

output_df = p.predict_from_excel(
    # --- master catalogue (candidates to match against) ---
    master_xlsx_path='./data/raw/Product Matching Dataset.xlsx',
    master_sheet='Master File',
    master_candidate_names_column='product_name_ar',
    master_prices_column='price',
    # --- validation queries (seller item names to be matched) ---
    test_xlsx_path='./data/preprocessed/validation_dataset.xlsx',
    test_sheet="validation_data",
    query_names_column='seller_item_name',
    query_prices_column='price',
    # --- where the predictions are written ---
    output_xlsx_path='./validation_output.xlsx',
    output_sheet_name="validation_output",
    # presumably the number of top-ranked candidates kept per query
    # (downstream cells read a `sku1` column) -- confirm in src.predictor
    k=3,
)

checkpoints/checkpoint_epoch_15.pth
Loaded master and test data from Excel files.
Data cleaning completed.
Model start...
Model done finding most similar candidates.
DataFrame saved successfully to new file ./validation_output.xlsx in sheet 'validation_output'.
Predictions saved to ./validation_output.xlsx


In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Top-1 evaluation: checks whether the first-ranked candidate (`sku1`) is the
# true `sku`. Hits in 2nd or 3rd place are NOT credited here.
# Note: with average='micro' on single-label multiclass data, precision,
# recall and F1 all reduce to plain accuracy, so the three numbers coincide.
for label, metric in (
    ("Precision Score : ", precision_score),
    ("Recall Score    : ", recall_score),
    ("F1 Score        : ", f1_score),
):
    print(f"{label}{metric(output_df['sku'], output_df['sku1'], average='micro')}")

Precision Score : 0.985361653272101
Recall Score    : 0.985361653272101
F1 Score        : 0.985361653272101


### Option 2: Manual loading and prediction

In [7]:
from src.predictor import Predictor
import pandas as pd
from src.utils import add_top_k_scores_to_df, save_dataframe_to_xlsx

In [8]:
# Read the preprocessed validation queries and the master product list.
valid_df = pd.read_csv('./data/preprocessed/validation_dataset.csv')
master_file = pd.read_csv('./data/preprocessed/master_file.csv')

# Query side: the seller-provided item names and their prices.
queries = valid_df['seller_item_name']
query_prices = valid_df['price']

# Candidate side: the master-file product names and their prices.
candidates = master_file['product_name_ar']
candidate_prices = master_file['price']

In [9]:
p = Predictor()

# Rank the full candidate list against every validation query, one query at
# a time, passing the query's own price alongside the candidate prices.
#
# `idx` comes from enumerate(), i.e. it is a 0-based POSITION. The price is
# therefore fetched with `.iloc[idx]` (positional) rather than
# `query_prices[idx]` (label lookup): the label form only happens to work
# while valid_df carries the default 0..n-1 index, and would pick the wrong
# price or raise KeyError once the frame is filtered, shuffled or re-indexed.
scores = [
    p.candidate_ranking(
        query, candidates, query_prices.iloc[idx],
        candidate_prices, sort=True
    )
    for idx, query in enumerate(queries)
]
# Per the original author's note the result is expected to have shape
# (num queries, num candidates, 3) -- TODO confirm against
# Predictor.candidate_ranking.

checkpoints/checkpoint_epoch_15.pth


In [10]:
# Merge the ranking results back into the validation frame (k=3). Later
# cells read `sku1`/`sku2`/`sku3` from valid_df, presumably added here.
# NOTE: this rebinds `valid_df`, so re-running the cell feeds the already
# augmented frame back into the helper.
valid_df = add_top_k_scores_to_df(
    valid_df,
    scores,
    master_file,
    k=3,
)

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Top-1 evaluation: is the first-ranked candidate (`sku1`) the true `sku`?
# Matches that only appear in 2nd or 3rd place are not counted.
# With average='micro' on single-label multiclass data the three scores are
# all equal to accuracy, hence the identical values printed below.
y_true = valid_df['sku']
y_pred = valid_df['sku1']

for label, metric in (
    ("Precision Score : ", precision_score),
    ("Recall Score    : ", recall_score),
    ("F1 Score        : ", f1_score),
):
    print(f"{label}{metric(y_true, y_pred, average='micro')}")

Precision Score : 0.985361653272101
Recall Score    : 0.985361653272101
F1 Score        : 0.985361653272101


In [13]:
# Collect every example whose true SKU is absent from all three suggested
# candidates (sku1, sku2 and sku3).
true_sku = valid_df['sku']

# Start from "top-1 is wrong" and keep narrowing with the remaining ranks.
missed = true_sku.ne(valid_df['sku1'])
for rank_col in ('sku2', 'sku3'):
    missed &= true_sku.ne(valid_df[rank_col])

not_in_top3 = valid_df[missed]

print(f"Correct candidate not in top three: {len(not_in_top3)}")

Correct candidate not in top three: 27


In [16]:
# Persist the augmented validation frame to the 'Results' sheet.
# NOTE(review): this is the same path Option 1 passes as output_xlsx_path;
# whether an existing workbook is replaced or extended depends on
# save_dataframe_to_xlsx -- confirm before running both options.
save_dataframe_to_xlsx(
    file_path='./validation_output.xlsx',
    sheet_name='Results',
    df=valid_df,
)

DataFrame saved successfully to ./validation_output.xlsx
