In [None]:
%%time
import sys
if not r'G:\PythonProjects\WineRecognition2' in sys.path:
    sys.path.insert(0, r'G:\PythonProjects\WineRecognition2')
import os
import pickle
import json
import torch
import pandas as pd
from data_master import DataGenerator
from mlflow_utils import log_mlflow_on_test

In [None]:
%%time
# Directory of the trained model run; the pickled model is read from
# <MODEL_PATH>/model/model.pkl.
MODEL_PATH = r"G:/PythonProjects/WineRecognition2/artifacts/train/CRF_with_LSTM_features_17112021_170520"
# Text dataset to run on; it is read whole and split on '\n' before being
# turned into sentences.
DATASET_PATH = r"G:\PythonProjects\WineRecognition2\data\text\WineSearcher_Wine_AU-all_rows-all_keys.txt"
# Serialized BiLSTM-CRF network (loaded with torch.load); its embedding and
# LSTM layers are used to produce the per-word features.
LSTM_MODEL_PATH = r"G:\PythonProjects\WineRecognition2\artifacts\train\BiLSTM_CRF_10112021_030733\model\data\model.pth"
# JSON vocabulary mapping words to indices; its 'UNK' entry is looked up for
# words that are not in the vocabulary.
VOCAB_PATH = r"G:\PythonProjects\WineRecognition2\data\vocabs\Words_Halliday_Wine_AU.json"
# Torch device the BiLSTM and the input tensors are moved to.
DEVICE = 'cuda'
# OUTPUT_DIR and START_TIME are not used by the notebook itself; they are only
# forwarded in run_params.
# NOTE(review): the empty defaults look like placeholders filled in by an
# external runner — confirm.
OUTPUT_DIR = ''
START_TIME = ''
# Forwarded in run_params under the key 'runname'.
RUN_NAME = 'test_run'
# True: test_eval is a list of (word, true_tag, predicted_tag) triples per sentence.
# False: test_eval is a DataFrame with one row per sentence and one column per
# model class.
COMPUTE_METRICS = False

In [None]:
%%time
# Load the pickled model that consumes the LSTM features (a CRF, judging by
# the artifact name).
# NOTE: pickle.load executes arbitrary code stored in the file; only load
# artifacts from a trusted source.
with open(os.path.join(MODEL_PATH, 'model', 'model.pkl'), 'rb') as file:
    model = pickle.load(file)

# map_location remaps the stored tensors onto DEVICE while deserializing, so a
# checkpoint saved from a GPU can also be loaded when DEVICE is 'cpu'.
# .eval() switches the network to inference mode.
bilstm_crf = torch.load(LSTM_MODEL_PATH, map_location=DEVICE).to(DEVICE).eval()

# Vocabulary: word -> index used to build the input tensors for the BiLSTM.
with open(VOCAB_PATH, 'r', encoding='utf-8') as file:
    word_to_ix = json.load(file)

In [None]:
%%time
# getting features from lstm

def get_lstm_features(model, x):
    """Return the LSTM outputs for ``x``.

    ``x`` is passed through ``model.embedding`` and then ``model.lstm``; only
    the first element of the pair returned by the LSTM is kept, the second is
    discarded.
    """
    embedded = model.embedding(x)
    lstm_out, _state = model.lstm(embedded)
    return lstm_out

def features_with_keys(sentence):
    """Turn every feature vector in ``sentence`` into a dict keyed 'A0', 'A1', ...

    Returns a list with one dict per element of ``sentence``, where key
    ``'A<i>'`` maps to the i-th value of that element.
    """
    keyed = []
    for vector in sentence:
        keyed.append({f'A{pos}': value for pos, value in enumerate(vector)})
    return keyed

def is_number(s):
    """Return True if ``float(s)`` succeeds, False if it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def preprocess_sent(x_sent, y_sent):
    """Normalise numeric bottle-size words of ``x_sent`` in place.

    Every word whose tag in ``y_sent`` is 'Add_BottleSize' and which parses as
    a number is replaced by ``str(float(word))`` (e.g. '750' -> '750.0').
    ``x_sent`` is mutated; nothing is returned.
    """
    for pos, (word, tag) in enumerate(zip(x_sent, y_sent)):
        if tag == 'Add_BottleSize' and is_number(word):
            x_sent[pos] = str(float(word))

# Run every sentence of the dataset through the BiLSTM (no gradients needed)
# and collect the per-word feature dicts together with the true labels.
with torch.no_grad():
    # Read through a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    # NOTE(review): no encoding is given, so the platform default applies,
    # while the vocabulary is read as UTF-8 — confirm the dataset's encoding.
    with open(DATASET_PATH) as dataset_file:
        dataset_lines = dataset_file.read().split('\n')

    # Materialise the sentences: they are iterated here and again when
    # test_eval is built, and a one-shot iterator would silently yield nothing
    # the second time.
    sents = list(DataGenerator.generate_sents2(dataset_lines))
    metadata = {'features': [], 'labels': []}

    for x_sent, y_sent in sents:
        # Mutates x_sent in place (numeric bottle sizes become float strings).
        preprocess_sent(x_sent, y_sent)
        # Map words to vocabulary indices, falling back to the 'UNK' index.
        x_tensor = torch.tensor(
            [word_to_ix[word] if word in word_to_ix else word_to_ix['UNK'] for word in x_sent],
            dtype=torch.int64
        ).to(DEVICE)
        # unsqueeze(0)/squeeze(0) add and remove a leading dimension of size 1
        # around the model call.
        features = get_lstm_features(bilstm_crf, x_tensor.unsqueeze(0))
        features = features.squeeze(0).detach().cpu().numpy()
        metadata['features'].append(features_with_keys(features))
        metadata['labels'].append(y_sent)
y_test = metadata['labels']
X_test = metadata['features']
len(X_test), len(y_test)

In [None]:
%%time
# Predict labels for every sentence from the LSTM feature dicts;
# y_pred[i][j] is read below as the predicted label of word j in sentence i.
y_pred = model.predict(X_test)

In [None]:
%%time
if COMPUTE_METRICS:
    # One list of (word, true_tag, predicted_tag) triples per sentence.
    test_eval = [
        list(zip(sentence, tags, predicted))
        for (sentence, tags), predicted in zip(sents, y_pred)
    ]
else:
    # One row per sentence: for every class known to the model, the words
    # predicted as that class joined by single spaces. (Previously the words
    # were concatenated without any separator, so multi-word values ran
    # together; the trailing rstrip() shows a space separator was intended.)
    rows = []
    for (sentence, _tags), predicted in zip(sents, y_pred):
        words_by_class = {label: [] for label in model.classes_}
        for word, label in zip(sentence, predicted):
            # Labels outside model.classes_ are ignored.
            if label in words_by_class:
                words_by_class[label].append(f'{word}')
        rows.append({label: ' '.join(words) for label, words in words_by_class.items()})
    # Columns follow the order of model.classes_; classes with no predicted
    # words hold an empty string.
    test_eval = pd.DataFrame({key: [row.get(key) for row in rows] for key in model.classes_})

In [None]:
%%time
# Snapshot of the notebook configuration, passed on to the MLflow logging call.
run_params = dict(
    model_path=MODEL_PATH,
    lstm_model_path=LSTM_MODEL_PATH,
    vocab_path=VOCAB_PATH,
    device=DEVICE,
    dataset_path=DATASET_PATH,
    output_dir=OUTPUT_DIR,
    compute_metrics=COMPUTE_METRICS,
    start_time=START_TIME,
    runname=RUN_NAME,
)

In [None]:
# Hand the run configuration, the model, the true and predicted labels and the
# evaluation output to the project's MLflow helper.
# test_eval is a list of per-sentence triples when COMPUTE_METRICS is True and
# a pandas DataFrame otherwise.
# NOTE(review): what the helper actually logs is not visible from this
# notebook — see mlflow_utils.
log_mlflow_on_test(
    run_params=run_params,
    model=model,
    y_true=y_test,
    y_pred=y_pred,
    test_eval=test_eval
)