In [1]:
import sys
import json
import string
import torch
import numpy as np

# Make the project checkout importable so the `nn` and `data_master`
# imports below resolve; prepend it only if it is not already present.
_project_root = r'G:\PythonProjects\WineRecognition2'
if sys.path.count(_project_root) == 0:
    sys.path.insert(0, _project_root)

from nn.utils import generate_tag_to_ix, get_model_confidence
from nn.mlflow_utils import log_mlflow_on_test
from data_master import DataGenerator, count_unk_foreach_tag

In [2]:
# --- Run identity -----------------------------------------------------------
RUN_NAME = 'Test-100-256'
START_TIME = ''

# --- Model ------------------------------------------------------------------
MODEL_NAME = 'BiLSTM_CRF'
MODEL_PATH = 'G:/PythonProjects/WineRecognition2/artifacts/train/BiLSTM_CRF_10112021_032950/model/data/model.pth'
DEVICE = 'cpu'

# --- Inputs -----------------------------------------------------------------
DATA_PATH = r'G:\PythonProjects\WineRecognition2\data\text\menu_txt.txt'
VOCAB_PATH = 'G:/PythonProjects/WineRecognition2/data/vocabs/Words_Halliday_Wine_AU.json'
DATAINFO_PATH = 'G:/PythonProjects/WineRecognition2/data_info.json'
# When False, vocabulary keys and input tokens are lower-cased before lookup.
CASE_SENSITIVE_VOCAB = True

# --- Outputs ----------------------------------------------------------------
OUTPUT_DIR = 'G:/PythonProjects/WineRecognition2/artifacts/test/test'
COMPUTE_METRICS = False

In [3]:
# Read the word -> index vocabulary from its JSON file.
with open(VOCAB_PATH, encoding='utf-8') as vocab_file:
    word_to_ix = json.load(vocab_file)
if not CASE_SENSITIVE_VOCAB:
    # Fold every key to lowercase; if two keys collide after folding,
    # the one that appears later in the file wins.
    word_to_ix = dict((word.lower(), index) for word, index in word_to_ix.items())
len(word_to_ix)

12139

In [4]:
# Read the whole input text, split it on '\n', and let the project's
# DataGenerator turn the lines into the items that later cells unpack as
# (sentence, tags) pairs.
with open(DATA_PATH, encoding='utf-8') as menu_file:
    menu_lines = menu_file.read().split('\n')
x_test = DataGenerator.generate_sents2(menu_lines)

In [5]:
# Encode each sentence as an int64 tensor of vocabulary indices and collect
# its tags separately (they are passed on later as the reference labels).
unk = 'UNK' if CASE_SENSITIVE_VOCAB else 'unk'
x_tensor, y_test = [], []
for tokens, tags in x_test:
    y_test.append(tags)
    if not CASE_SENSITIVE_VOCAB:
        tokens = [token.lower() for token in tokens]
    # Tokens missing from the vocabulary are looked up under the `unk` key.
    token_ids = [word_to_ix[token if token in word_to_ix else unk] for token in tokens]
    x_tensor.append(torch.tensor(token_ids, dtype=torch.int64))
x_tensor

[tensor([12047,  1612,   222,   124,    11,   222,   124,  6362,  2646,  3152]),
 tensor([12047,   895,   619,  8847,     4,   222,   124,  6362,  2646, 12138]),
 tensor([12041,   528,   279,   222,   124,  6362,  2646, 12138]),
 tensor([12041, 10532,    15,  2913,  1897,  6362,  2646, 12138]),
 tensor([12047, 10330,    13,   366,  2913,  1897,  6362,  2646,  7593]),
 tensor([12041, 10180,   531,  3026,    23,  3373,   531,  6362,  6406,  3026,
         12138]),
 tensor([12047,   822,   222,   124,  6398, 12138]),
 tensor([12041,  6315,  3268,  6582,  6292,   124,  6582, 12138]),
 tensor([12036, 10342,  6360,  3643,   222,   124,  6582, 12138]),
 tensor([12036,   528, 10146,  3251,  6292,   124, 12138,  3152]),
 tensor([12138, 12110, 12138,  6361,  6362, 12138,   221,   124, 12138, 12138]),
 tensor([10785, 11940,  6369,  6292,   124, 12138,  8090]),
 tensor([12138, 12110, 12138,  6393,   221,   124, 12138, 12138]),
 tensor([10903,  1119, 12138,  6293, 12138]),
 tensor([10170, 12138, 10

In [6]:
# Load the serialized model and switch it to inference mode.
#
# `map_location=DEVICE` remaps the stored tensors while they are being
# deserialized, so a checkpoint saved from a CUDA device still loads on a
# machine where only DEVICE (e.g. 'cpu') is available; without it torch.load
# tries to restore tensors on the device they were saved from and fails
# before `.to(DEVICE)` is ever reached.
#
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code — only point MODEL_PATH at checkpoints you trust.
model = torch.load(MODEL_PATH, map_location=DEVICE).to(DEVICE)
model.eval()

BiLSTM_CRF(
  (embedding): Embedding(12139, 256, padding_idx=12137)
  (lstm): LSTM(256, 64, batch_first=True, bidirectional=True)
  (hidden2tags): Linear(in_features=128, out_features=15, bias=True)
  (crf): CRF(num_tags=15)
)

In [7]:
# Run the model over every encoded sentence, one sentence per forward pass,
# collecting the decoded tags and a per-sentence confidence value.
y_pred = []
confs = []
with torch.no_grad():
    for x in x_tensor:
        x = x.unsqueeze(0).to(DEVICE)  # add a leading dimension of size 1
        best_tag_sequence = model(x)
        # Create the tag tensor on the same device as the input. A bare
        # torch.tensor(...) always lands on the CPU, which breaks the
        # likelihood computation as soon as DEVICE is not 'cpu'.
        tag_tensor = torch.tensor(best_tag_sequence, device=x.device)
        # exp(-NLL) of the decoded sequence, used here as the confidence.
        confidence = torch.exp(-model.neg_log_likelihood(x, tag_tensor))
        y_pred.append(best_tag_sequence)
        confs.append(confidence.item())

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


In [8]:
# Build the tag -> index map from the 'keys' -> 'all' entry of the data-info
# JSON, then invert it for decoding predictions.
# The encoding is given explicitly (as for the other files read in this
# notebook) so that decoding does not depend on the platform's locale default.
with open(DATAINFO_PATH, encoding='utf-8') as file:
    tag_to_ix = generate_tag_to_ix(json.load(file)['keys']['all'])
ix_to_tag = {value: key for key, value in tag_to_ix.items()}
ix_to_tag

{0: 'Add_TradeName',
 1: 'Add_Brand',
 2: 'Add_KeyWordTrue',
 3: 'Add_KeyWordFalse',
 4: 'Add_GrapeVarieties',
 5: 'Add_GeoIndication',
 6: 'Add_WineType',
 7: 'Add_BottleSize',
 8: 'Add_Sweetness',
 9: 'Add_WineColor',
 10: 'Add_ClosureType',
 11: 'Add_Certificate',
 12: 'Add_Vintage',
 13: 'Add_Price',
 14: 'Punctuation',
 15: 'Other'}

In [9]:
# Per-tag unknown-token statistics for the encoded test set.
# NOTE(review): presumably counts tokens equal to the UNK index under each
# reference tag — confirm against data_master.count_unk_foreach_tag.
unk_foreach_tag = count_unk_foreach_tag(
    x_tensor,
    y_test,
    list(tag_to_ix.keys()),
    word_to_ix[unk],
)

In [9]:
# Replace each stored prediction (whose first element is the list of tag
# indices) with the corresponding list of tag names, in place.
# Run this cell once per inference pass: after conversion the entries hold
# strings, so a second run would fail the integer-keyed ix_to_tag lookup.
y_pred[:] = [[ix_to_tag[tag] for tag in pred[0]] for pred in y_pred]

In [11]:
# Parameters recorded with the test run: the notebook configuration plus
# the mean per-sentence confidence and the JSON-encoded unk statistics.
run_params = dict(
    model_name=MODEL_NAME,
    model_path=MODEL_PATH,
    run_name=RUN_NAME,
    start_time=START_TIME,
    output_dir=OUTPUT_DIR,
    data_path=DATA_PATH,
    vocab_path=VOCAB_PATH,
    datainfo_path=DATAINFO_PATH,
    case_sensitive_vocab=CASE_SENSITIVE_VOCAB,
    device=DEVICE,
    models_confidence=np.mean(confs),
    compute_metrics=COMPUTE_METRICS,
    unk_foreach_tag=json.dumps(unk_foreach_tag),
)

In [None]:
# Hand the run parameters, the tag names, the raw sentences (tags stripped),
# the decoded predictions and the reference tags to the project's MLflow
# test logger.
test_sentences = [tokens for tokens, _ in x_test]
log_mlflow_on_test(
    run_params,
    classes=list(ix_to_tag.values()),
    x_test=test_sentences,
    y_pred=y_pred,
    y_true=y_test
)