In [1]:
%%time
import sys
from joblib import Parallel, delayed
# Put the local project checkout at the front of the import search path
# (presumably where data_master, nn, mlflow_utils and features live - they are
# imported right below). The membership guard keeps repeated runs of this cell
# from stacking duplicate entries.
PROJECT_ROOT = r'G:\PythonProjects\WineRecognition2'
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import os
import json
import torch
import numpy as np
import pandas as pd
import sklearn_crfsuite
import eli5
from sklearn.model_selection import train_test_split

from data_master import DataGenerator, DataLoader, count_unk_foreach_tag, compute_model_confidence
from nn.utils import CustomDataset, generate_tag_to_ix
from mlflow_utils import log_mlflow_on_train
from features import features

Wall time: 1.97 s


In [2]:
%%time
# --- Input artefacts --------------------------------------------------------
DATASET_PATH = r'G:\PythonProjects\WineRecognition2\data\text\halliday_winesearcher_menu_gen_samplesv2\Halliday_WineSearcher_MenuGenSamples.txt'
DICTIONARY_PATH = r'G:\PythonProjects\WineRecognition2\data\dictionaries\Dict-byword_Halliday_Winesearcher_Wine_AU-only_completed_rows'
VOCAB_PATH = 'G:/PythonProjects/WineRecognition2/data/vocabs/Words_Halliday_Wine_AU_WORD_NUMS.json'
DATAINFO_PATH = 'G:/PythonProjects/WineRecognition2/data_info.json'
LSTM_MODEL_PATH = 'G:/PythonProjects/WineRecognition2/artifacts/train/BiLSTM_CRF_17022022_185854/model/data/model.pth'

# --- Vocabulary / preprocessing switches (forwarded to CustomDataset) --------
CASE_SENSITIVE_VOCAB = False
USE_NUM2WORDS = True

# --- Torch device the loaded network and its inputs are moved to -------------
DEVICE = 'cuda'

# --- sklearn_crfsuite.CRF hyper-parameters -----------------------------------
ALGORITHM = 'lbfgs'
C1 = 0.1
C2 = 0.1
MAX_ITERATIONS = 5
ALL_POSSIBLE_TRANSITIONS = True

# --- Fraction of the sentences held out by train_test_split ------------------
TEST_SIZE = 0.2

# --- Run bookkeeping (only copied into run_params for logging) ---------------
MODEL_NAME = 'CRF_with_LSTM_features'
RUN_NAME = 'Train-LSTMfeatures'
OUTPUT_DIR = r'G:\PythonProjects\WineRecognition2\artifacts\train\test_exp'
START_TIME = ''  # left empty here; NOTE(review): presumably filled in by an external launcher - confirm

Wall time: 0 ns


In [3]:
%%time
# Load the network stored at LSTM_MODEL_PATH, move it to DEVICE and switch it to
# eval mode. map_location=DEVICE remaps the stored tensors while unpickling, so a
# checkpoint that was saved on a GPU can still be opened when DEVICE is 'cpu'.
# NOTE(review): torch.load unpickles arbitrary Python objects - only point
# LSTM_MODEL_PATH at checkpoints you trust.
model = torch.load(LSTM_MODEL_PATH, map_location=DEVICE).to(DEVICE).eval()

# Frequency dictionary handed to features.sent2features later on; loaded with
# to_lowercase=True.
freq_dict = DataLoader.load_frequency_dictionary(DICTIONARY_PATH, to_lowercase=True)

# Vocabulary JSON; the loaded object is passed to CustomDataset as word_to_ix.
with open(VOCAB_PATH, encoding='utf-8') as file:
    word_to_ix = json.load(file)

# Tag names come from data_info.json under ['keys']['all']. The encoding is given
# explicitly (as for the vocabulary above) so parsing does not depend on the
# platform's locale default.
with open(DATAINFO_PATH, encoding='utf-8') as file:
    keys = json.load(file)['keys']['all']

# Forward (tag -> index) and inverse (index -> tag) lookup tables.
tag_to_ix = generate_tag_to_ix(keys)
ix_to_tag = {value: key for key, value in tag_to_ix.items()}

tag_to_ix

Wall time: 1.67 s


{'Add_TradeName': 0,
 'Add_Brand': 1,
 'Add_KeyWordTrue': 2,
 'Add_KeyWordFalse': 3,
 'Add_GrapeVarieties': 4,
 'Add_GeoIndication': 5,
 'Add_WineType': 6,
 'Add_BottleSize': 7,
 'Add_Sweetness': 8,
 'Add_WineColor': 9,
 'Add_ClosureType': 10,
 'Add_Certificate': 11,
 'Add_Vintage': 12,
 'Add_Price': 13,
 'Punctuation': 14,
 'Other': 15}

In [4]:
# Upper bound on the number of sentences used in this run; slicing keeps the
# first MAX_SENTENCES entries.
MAX_SENTENCES = 2000

# Read the raw dataset inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(DATASET_PATH, encoding='utf-8') as dataset_file:
    dataset_lines = dataset_file.read().split('\n')

sents = DataGenerator.generate_sents2(dataset_lines)
sents = sents[:MAX_SENTENCES]

# prepare_dataset=False is passed on purpose here; the cells below work with
# dataset.raw_data() and dataset.sentence_to_indices() directly.
dataset = CustomDataset(
    sents, tag_to_ix, word_to_ix, case_sensitive=CASE_SENSITIVE_VOCAB, prepare_dataset=False, convert_nums2words=USE_NUM2WORDS
)

In [5]:
%%time
# Accumulators filled by the feature-extraction loop further down in this cell:
# one index tensor per sentence, plus the per-sentence feature dicts and labels.
X_tensors = list()
metadata = dict(features=list(), labels=list())

def features_with_keys(sentence):
    """Turn every token's feature vector into a dict keyed 'A0', 'A1', ...

    sentence: iterable of per-token iterables of feature values.
    Returns a list with one dict per token; the dict maps f'A{i}' to the i-th
    value of that token's vector.
    """
    keyed_tokens = []
    for token_vector in sentence:
        keyed = {}
        for position, value in enumerate(token_vector):
            keyed[f'A{position}'] = value
        keyed_tokens.append(keyed)
    return keyed_tokens

def compute_our_features(x_sent, y_sent):
    """Compute the project-defined features for one sentence.

    x_sent and y_sent are zipped element-wise into a list of pairs, which is
    handed to features.sent2features together with the global freq_dict.
    Returns whatever features.sent2features returns.
    """
    paired_tokens = list(zip(x_sent, y_sent))
    return features.sent2features(paired_tokens, freq_dict)
            
# Run compute_our_features over every (x_sent, y_sent) pair on 8 joblib workers.
feature_jobs = (
    delayed(compute_our_features)(x_sent, y_sent)
    for x_sent, y_sent in dataset.raw_data()
)
all_our_features = Parallel(n_jobs=8)(feature_jobs)

# Inference only, so gradient tracking is switched off for the whole loop.
with torch.no_grad():
    for (x_sent, y_sent), our_features in zip(dataset.raw_data(), all_our_features):
        word_indices = dataset.sentence_to_indices(x_sent)
        x_tensor = torch.tensor(word_indices, dtype=torch.int64)
        X_tensors.append(x_tensor)

        # The sentence gets a leading dimension before the call (unsqueeze) and
        # loses it again afterwards (squeeze).
        batch = x_tensor.to(DEVICE).unsqueeze(0)
        lstm_output = model.get_lstm_features(batch)
        final_features = features_with_keys(lstm_output.squeeze(0).detach().cpu().numpy())

        # Merge the project features into the 'A<i>' dicts, token by token.
        for position in range(len(x_sent)):
            final_features[position].update(our_features[position])

        metadata['labels'].append(y_sent)
        metadata['features'].append(final_features)

x = metadata['features']
labels = metadata['labels']

len(x), len(labels)

Wall time: 1min 29s


(2000, 2000)

In [6]:
%%time
# Fixed seed for the shuffle inside train_test_split. Without it every kernel
# run produces a different train/validation partition, so the metrics logged
# further down could not be reproduced or compared between runs.
SPLIT_SEED = 42

# Features, labels and the raw sentences are split in one call so the three
# stay aligned index-for-index.
X_train, X_val, y_train, y_val, train_sents, val_sents = train_test_split(
    x, labels, dataset.raw_data(), test_size=TEST_SIZE, random_state=SPLIT_SEED
)
len(X_train), len(X_val), len(y_train), len(y_val), len(train_sents), len(val_sents)

Wall time: 2 ms


(1600, 400, 1600, 400, 1600, 400)

In [7]:
%%time
# NOTE: this rebinds `model`, which until now referred to the torch network
# loaded from LSTM_MODEL_PATH; from here on `model` is the CRF. Re-running the
# feature-extraction cell after this one needs the LSTM to be loaded again first.
crf_settings = dict(
    algorithm=ALGORITHM,
    c1=C1,
    c2=C2,
    max_iterations=MAX_ITERATIONS,
    all_possible_transitions=ALL_POSSIBLE_TRANSITIONS,
)
model = sklearn_crfsuite.CRF(**crf_settings)
model.fit(X_train, y_train)

# Display the fitted weights (the transition table and per-tag feature weights
# shown in the cell output), limited via top=(30, 30).
eli5.show_weights(model, top=(30, 30))

Wall time: 2.5 s




From \ To,Add_BottleSize,Add_Brand,Add_Certificate,Add_ClosureType,Add_GeoIndication,Add_GrapeVarieties,Add_KeyWordFalse,Add_KeyWordTrue,Add_Price,Add_Sweetness,Add_TradeName,Add_Vintage,Add_WineColor,Add_WineType,Punctuation
Add_BottleSize,0.29,-0.068,-0.022,-0.003,-0.039,-0.048,-0.025,-0.031,-0.031,0.012,-0.04,-0.005,0.021,0.018,-0.026
Add_Brand,-0.088,0.359,-0.024,-0.05,0.025,0.13,-0.014,0.034,-0.049,-0.041,-0.197,-0.075,-0.042,-0.036,0.011
Add_Certificate,-0.024,-0.02,-0.007,-0.013,-0.011,-0.014,-0.007,-0.008,-0.01,-0.01,-0.016,-0.021,-0.01,-0.01,-0.008
Add_ClosureType,-0.004,-0.038,-0.013,0.206,-0.025,-0.044,-0.013,-0.019,-0.018,0.022,-0.025,-0.01,0.003,0.02,-0.017
Add_GeoIndication,-0.027,-0.096,-0.012,-0.011,0.176,0.082,-0.011,0.0,-0.027,0.001,-0.043,-0.029,0.004,0.006,-0.019
Add_GrapeVarieties,0.0,-0.111,-0.018,0.0,0.056,0.0,-0.015,0.035,-0.049,0.049,-0.048,-0.042,0.044,0.053,0.126
Add_KeyWordFalse,-0.021,-0.021,-0.007,-0.013,-0.003,-0.009,-0.007,-0.006,-0.015,-0.01,-0.016,-0.023,-0.011,-0.009,-0.008
Add_KeyWordTrue,-0.017,-0.057,-0.009,-0.006,0.01,0.032,-0.009,0.22,-0.019,0.002,-0.026,-0.025,0.0,-0.0,0.04
Add_Price,-0.076,-0.046,-0.012,-0.038,-0.023,-0.025,-0.012,-0.016,0.318,-0.028,-0.034,-0.084,-0.027,-0.027,-0.014
Add_Sweetness,0.023,-0.026,-0.008,0.002,-0.019,-0.027,-0.011,-0.014,0.028,-0.023,-0.019,0.002,0.057,0.053,-0.012

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14
+0.517,A56,,,,,,,,,,,,,
+0.442,A15,,,,,,,,,,,,,
+0.383,A95,,,,,,,,,,,,,
+0.372,A2,,,,,,,,,,,,,
+0.371,A111,,,,,,,,,,,,,
+0.367,A70,,,,,,,,,,,,,
+0.349,A6,,,,,,,,,,,,,
+0.345,A4,,,,,,,,,,,,,
+0.341,A17,,,,,,,,,,,,,
+0.306,A71,,,,,,,,,,,,,

Weight?,Feature
+0.517,A56
+0.442,A15
+0.383,A95
+0.372,A2
+0.371,A111
+0.367,A70
+0.349,A6
+0.345,A4
+0.341,A17
+0.306,A71

Weight?,Feature
+0.595,A18
+0.428,A51
+0.406,A40
+0.379,A52
+0.370,A10
+0.346,A20
+0.332,A58
+0.317,A17
+0.312,A66
+0.304,A105

Weight?,Feature
+0.135,A63
+0.101,A96
+0.098,A81
+0.084,A110
+0.078,A90
+0.069,A39
+0.068,A74
+0.066,A68
+0.058,A120
+0.058,A35

Weight?,Feature
+0.353,A90
+0.335,Add_ClosureType
+0.334,A85
+0.323,A54
+0.295,A74
+0.293,A13
+0.290,A35
+0.272,A126
+0.249,A81
+0.232,A103

Weight?,Feature
+0.463,Add_GeoIndication
+0.372,A68
+0.352,A118
+0.307,A9
+0.296,A19
+0.292,A27
+0.277,A108
+0.266,A61
+0.263,A97
+0.255,A85

Weight?,Feature
+0.419,Add_GrapeVarieties
+0.407,A103
+0.398,A104
+0.392,A35
+0.385,A105
+0.365,A36
+0.354,A33
+0.307,A44
+0.291,A102
+0.273,A52

Weight?,Feature
+0.093,A104
+0.073,A68
+0.070,A120
+0.069,A36
+0.053,A1
+0.046,A84
+0.045,A6
+0.044,A2
+0.042,A105
+0.042,A51

Weight?,Feature
+0.321,A16
+0.311,A37
+0.300,A62
+0.281,A99
+0.268,A127
+0.263,A114
+0.260,A66
+0.254,A29
+0.249,A53
+0.242,A28

Weight?,Feature
+0.491,A46
+0.387,A62
+0.375,A50
+0.368,A54
+0.351,A45
+0.350,A11
+0.343,A29
+0.342,A30
+0.339,A20
+0.301,A14

Weight?,Feature
+0.310,A90
+0.297,A118
+0.288,Add_Sweetness
+0.287,word.lower():dry
+0.287,word[-3:]:dry
+0.281,A27
+0.276,A49
+0.275,A95
+0.253,A21
+0.244,A72

Weight?,Feature
+0.548,A65
+0.541,A96
+0.404,A48
+0.390,A1
+0.342,A11
+0.341,A27
+0.320,A41
+0.319,A26
+0.316,A87
+0.315,A18

Weight?,Feature
+0.482,A15
+0.472,A34
+0.414,A49
+0.404,A59
+0.401,A26
+0.393,A41
+0.388,A9
+0.386,A44
+0.347,A8
+0.334,A1

Weight?,Feature
+0.339,Add_WineColor
+0.337,A103
+0.284,A68
+0.282,A56
+0.258,A87
+0.250,A92
+0.239,A67
+0.239,A3
+0.233,A94
+0.231,A102

Weight?,Feature
+0.396,A63
+0.308,A33
+0.289,Add_WineType
+0.287,word.lower():still
+0.286,word[-3:]:ill
+0.273,A48
+0.267,A38
+0.252,A85
+0.252,A1
+0.243,A54

Weight?,Feature
+0.299,A74
+0.283,A122
+0.274,A52
+0.260,A101
+0.255,A39
+0.254,A84
+0.249,A94
+0.248,A57
+0.239,A111
+0.229,A104


In [None]:
%%time
# `model` is the CRF fitted above (not the LSTM loaded earlier under the same name).
# Predicted tag sequence for every validation sentence.
y_pred = model.predict(X_val)
# Marginal tag probabilities for the same sentences; consumed below by
# compute_model_confidence and DataGenerator.generate_probability_table.
marginals = model.predict_marginals(X_val)

In [None]:
# Vocabulary index stored under the dataset's unknown-word key.
unk_index = dataset.word_to_ix[dataset.unk]
# Computed over all sentences (X_tensors / labels), not only the validation part.
unk_foreach_tag = count_unk_foreach_tag(X_tensors, labels, keys, unk_index)

In [None]:
# Confidence values derived from the CRF marginals; their mean is logged below
# as 'models_confidence'.
confs = compute_model_confidence(marginals)
# Probability table built from the marginals and the validation sentences.
# NOTE(review): exact layout is defined in DataGenerator - confirm there.
prob_table = DataGenerator.generate_probability_table(marginals, val_sents)

In [None]:
%%time
# For every validation sentence, line up (token, true tag, predicted tag) triples.
# y_pred comes from model.predict(X_val), so it is aligned with val_sents.
test_eval = [
    list(zip(sentence, tags, predicted_tags))
    for (sentence, tags), predicted_tags in zip(val_sents, y_pred)
]

In [None]:
%%time
# Everything recorded for this run: the configuration constants plus the values
# computed above (mean confidence, JSON-serialised UNK counts, probability table).
run_params = dict(
    dataset_path=DATASET_PATH,
    lstm_model_path=LSTM_MODEL_PATH,
    vocab_path=VOCAB_PATH,
    case_sensitive_vocab=CASE_SENSITIVE_VOCAB,
    dictionary_path=DICTIONARY_PATH,
    datainfo_path=DATAINFO_PATH,
    device=DEVICE,
    model_name=MODEL_NAME,
    algorithm=ALGORITHM,
    c1=C1,
    c2=C2,
    max_iterations=MAX_ITERATIONS,
    all_possible_transitions=ALL_POSSIBLE_TRANSITIONS,
    test_size=TEST_SIZE,
    runname=RUN_NAME,
    start_time=START_TIME,
    output_dir=OUTPUT_DIR,
    models_confidence=np.mean(confs),
    unk_foreach_tag=json.dumps(unk_foreach_tag),
    prob_table=prob_table,
    use_num2words=USE_NUM2WORDS,
)

In [None]:
%%time
# Hand the run parameters, the fitted CRF (`model`), the validation labels and
# predictions, and the per-token evaluation triples to the project's logging
# helper. NOTE(review): what exactly gets logged is defined in mlflow_utils.
log_mlflow_on_train(
    run_params=run_params,
    model=model,
    y_true=y_val,
    y_pred=y_pred,
    test_eval=test_eval
)