In [1]:
%%time
import sys

# Put the project checkout at the front of sys.path (only once) so the
# project-local imports further down in this cell can be resolved.
PROJECT_ROOT = r'G:\PythonProjects\WineRecognition2'
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import os
import json
import torch
import numpy as np
import pandas as pd
import sklearn_crfsuite
import eli5
from sklearn.model_selection import train_test_split

from data_master import DataGenerator, DataLoader, count_unk_foreach_tag, compute_model_confidence
from nn.utils import CustomDataset, generate_tag_to_ix
from mlflow_utils import log_mlflow_on_train
from features import features

Wall time: 2.45 s


In [None]:
%%time
# ---- Run identification -------------------------------------------------
MODEL_NAME = "CRF_with_LSTM_features"
RUN_NAME = 'Train-LSTMfeatures'
START_TIME = ''  # left empty here; logged as-is under 'start_time'

# ---- Input artifacts ----------------------------------------------------
DATASET_PATH = r"G:\PythonProjects\WineRecognition2\data\text\Halliday_Wine_AU-only_completed_rows-complex.txt"
LSTM_MODEL_PATH = r"G:\PythonProjects\WineRecognition2\artifacts\train\BiLSTM_CRF_10112021_030733\model\data\model.pth"
VOCAB_PATH = r"G:\PythonProjects\WineRecognition2\data\vocabs\Words_Halliday_Wine_AU.json"
DICTIONARY_PATH = r"G:\PythonProjects\WineRecognition2\data\dictionaries\Dict-byword_Halliday_Winesearcher_Wine_AU-only_completed_rows"
DATAINFO_PATH = 'G:/PythonProjects/WineRecognition2/data_info.json'

# ---- Output location ----------------------------------------------------
OUTPUT_DIR = r"G:\PythonProjects\WineRecognition2\artifacts\train\test_exp"

# ---- Dataset / vocabulary handling --------------------------------------
CASE_SENSITIVE_VOCAB = False  # forwarded to CustomDataset(case_sensitive=...)
USE_NUM2WORDS = False         # forwarded to CustomDataset(convert_nums2words=...)
TEST_SIZE = 0.2               # forwarded to train_test_split(test_size=...)

# ---- LSTM feature extractor ---------------------------------------------
DEVICE = 'cuda'  # device the loaded LSTM model and its input tensors are moved to

# ---- CRF hyper-parameters (forwarded to sklearn_crfsuite.CRF) -----------
ALGORITHM = 'lbfgs'
C1 = 0.1
C2 = 0.1
MAX_ITERATIONS = 5
ALL_POSSIBLE_TRANSITIONS = True

In [None]:
%%time
# Load the pre-trained LSTM model used as a feature extractor and switch it to eval mode.
# - map_location remaps the stored tensors onto DEVICE while unpickling, so a checkpoint
#   saved on a different device does not fail before .to(DEVICE) gets a chance to run.
# - NOTE(security): torch.load unpickles arbitrary objects; only load trusted checkpoints.
# - NOTE: the name `model` is rebound to the CRF estimator in a later cell. Re-run this
#   cell before re-running the feature-extraction cell, which calls model.get_lstm_features.
model = torch.load(LSTM_MODEL_PATH, map_location=DEVICE).to(DEVICE).eval()

# Frequency dictionary consumed by features.sent2features in the feature-extraction cell.
freq_dict = DataLoader.load_frequency_dictionary(DICTIONARY_PATH, to_lowercase=True)

# Vocabulary mapping handed to CustomDataset as word_to_ix.
with open(VOCAB_PATH, encoding='utf-8') as file:
    word_to_ix = json.load(file)

# Tag names come from data_info.json under keys -> all. The encoding is given explicitly
# so the result does not depend on the platform default (same as the vocabulary file).
with open(DATAINFO_PATH, encoding='utf-8') as file:
    keys = json.load(file)['keys']['all']

tag_to_ix = generate_tag_to_ix(keys)
# Reverse lookup: index -> tag name.
ix_to_tag = {value: key for key, value in tag_to_ix.items()}

tag_to_ix

In [None]:
# Read the raw dataset file line by line. The context manager guarantees the file
# handle is closed (the previous bare open(...).read() left it to the garbage collector).
with open(DATASET_PATH, encoding='utf-8') as dataset_file:
    dataset_lines = dataset_file.read().split('\n')

sents = DataGenerator.generate_sents2(dataset_lines)
# For a quick smoke test, truncate here, e.g.: sents = sents[:15]

# Wrap the sentences together with the tag / word index mappings.
dataset = CustomDataset(
    sents,
    tag_to_ix,
    word_to_ix,
    case_sensitive=CASE_SENSITIVE_VOCAB,
    prepare_dataset=False,
    convert_nums2words=USE_NUM2WORDS,
)

In [None]:
%%time
def features_with_keys(sentence):
    """Turn every per-token feature vector of `sentence` into a dict keyed 'A0', 'A1', ...

    Returns a list with one dict per element of `sentence`; the dict maps
    f'A{i}' to the i-th value of that element.
    """
    keyed_tokens = []
    for token_vector in sentence:
        keyed_tokens.append({f'A{idx}': value for idx, value in enumerate(token_vector)})
    return keyed_tokens


X_tensors = []  # index tensors of every sentence (used later for the UNK statistics)
x = []          # per-sentence list of merged feature dicts (CRF input)
labels = []     # per-sentence tag sequences (CRF target)

with torch.no_grad():
    for x_sent, y_sent in dataset.raw_data():
        # Hand-crafted features from the project's `features` module.
        our_features = features.sent2features(list(zip(x_sent, y_sent)), freq_dict)

        x_tensor = torch.tensor(dataset.sentence_to_indices(x_sent), dtype=torch.int64)
        X_tensors.append(x_tensor)

        # LSTM features for a batch of one sentence; the batch axis is dropped again
        # before the values are converted to numpy and keyed.
        lstm_output = model.get_lstm_features(x_tensor.to(DEVICE).unsqueeze(0))
        final_features = features_with_keys(lstm_output.squeeze(0).detach().cpu().numpy())

        # Merge the hand-crafted features into the LSTM feature dict of each token.
        for position in range(len(x_sent)):
            final_features[position].update(our_features[position])

        x.append(final_features)
        labels.append(y_sent)

metadata = {'features': x, 'labels': labels}

len(x), len(labels)

In [None]:
%%time
# Fixed seed so the train/validation partition (and therefore every metric logged
# further down) is reproducible across "Restart & Run All".
SPLIT_RANDOM_STATE = 42

# Features, labels and raw sentences are split together so they stay aligned.
X_train, X_val, y_train, y_val, train_sents, val_sents = train_test_split(
    x,
    labels,
    dataset.raw_data(),
    test_size=TEST_SIZE,
    random_state=SPLIT_RANDOM_STATE,
)
len(X_train), len(X_val), len(y_train), len(y_val), len(train_sents), len(val_sents)

In [None]:
%%time
# CRF hyper-parameters, all taken from the configuration cell.
crf_params = {
    'algorithm': ALGORITHM,
    'c1': C1,
    'c2': C2,
    'max_iterations': MAX_ITERATIONS,
    'all_possible_transitions': ALL_POSSIBLE_TRANSITIONS,
}

# NOTE: this rebinds `model`, which until now referred to the loaded LSTM model.
model = sklearn_crfsuite.CRF(**crf_params)
model.fit(X_train, y_train)

# Show the fitted weights (top 30 / 30) as the cell output.
eli5.show_weights(model, top=(30, 30))

In [None]:
%%time
# Run the fitted CRF over the validation features twice: once for the predicted
# tag sequences and once for the per-token marginals used by the confidence cells.
y_pred, marginals = model.predict(X_val), model.predict_marginals(X_val)

In [None]:
# Vocabulary index of the dataset's unknown-word token.
unk_index = dataset.word_to_ix[dataset.unk]
# Computed over all sentences (X_tensors / labels), not only the validation split.
unk_foreach_tag = count_unk_foreach_tag(X_tensors, labels, keys, unk_index)

In [None]:
# Confidence values derived from the validation marginals; their mean is later
# logged under 'models_confidence'.
confs = compute_model_confidence(marginals)
# Probability table built from the marginals and the validation sentences; later
# logged under 'prob_table'.
prob_table = DataGenerator.generate_probability_table(marginals, val_sents)

In [None]:
%%time
# For every validation sentence, line up (token, true tag, predicted tag) triples.
test_eval = []
for index, (sentence, tags) in enumerate(val_sents):
    predicted_tags = y_pred[index]
    test_eval.append(list(zip(sentence, tags, predicted_tags)))

In [None]:
%%time
# Everything that is handed to log_mlflow_on_train as `run_params`:
# the configuration constants plus the statistics computed in the cells above.
run_params = dict(
    # inputs
    dataset_path=DATASET_PATH,
    lstm_model_path=LSTM_MODEL_PATH,
    vocab_path=VOCAB_PATH,
    case_sensitive_vocab=CASE_SENSITIVE_VOCAB,
    dictionary_path=DICTIONARY_PATH,
    datainfo_path=DATAINFO_PATH,
    device=DEVICE,
    # CRF settings
    model_name=MODEL_NAME,
    algorithm=ALGORITHM,
    c1=C1,
    c2=C2,
    max_iterations=MAX_ITERATIONS,
    all_possible_transitions=ALL_POSSIBLE_TRANSITIONS,
    test_size=TEST_SIZE,
    # run bookkeeping
    runname=RUN_NAME,
    start_time=START_TIME,
    output_dir=OUTPUT_DIR,
    # computed statistics
    models_confidence=np.mean(confs),
    unk_foreach_tag=json.dumps(unk_foreach_tag),  # serialised to a JSON string
    prob_table=prob_table,
    use_num2words=USE_NUM2WORDS,
)

In [None]:
%%time
# Hand the run parameters, the fitted CRF and the validation results to the
# project's MLflow logging helper.
mlflow_kwargs = {
    'run_params': run_params,
    'model': model,
    'y_true': y_val,
    'y_pred': y_pred,
    'test_eval': test_eval,
}
log_mlflow_on_train(**mlflow_kwargs)