In [None]:
%load_ext autoreload
%autoreload 2

from bailarn.utils import utils

# Define tokenizer and word_embedder

In [None]:
from bailarn.tokenizer import constant as tokenizer_constant
from bailarn.tokenizer.tokenizer import Tokenizer

# Build the two lookup indexes handed to the tokenizer: one from the character
# list and one from the tag list, each with its own start index constant.
char_index = utils.build_tag_index(
    tokenizer_constant.CHARACTER_LIST,
    tokenizer_constant.CHAR_START_INDEX,
)
tag_index = utils.build_tag_index(
    tokenizer_constant.TAG_LIST,
    tokenizer_constant.TAG_START_INDEX,
)

# Instantiate the tokenizer with both indexes.
tokenizer_model = Tokenizer(char_index, tag_index)

In [None]:
def tokenize_func(sentence):
    """Tokenize ``sentence`` by delegating to ``tokenizer_model.predict``.

    Args:
        sentence: The text to tokenize (the notebook calls it with a Thai string).

    Returns:
        Whatever ``tokenizer_model.predict`` returns for ``sentence``.
    """
    prediction = tokenizer_model.predict(sentence)
    return prediction
tokenize_func("ฉันกินข้าว")

In [None]:
from bailarn.word_embedder.word2vec import Word2Vec
# Instantiate the project's Word2Vec wrapper with its default arguments.
w2v_model = Word2Vec()

# Vocabulary mapping exposed by the wrapped embedding model.
# NOTE(review): `.wv.vocab` looks like the gensim < 4.0 KeyedVectors API, which
# gensim 4.0 replaced with `key_to_index` -- confirm the pinned gensim version.
w2v_vocab = w2v_model.model.wv.vocab
# Preview the first 10 (key, value) pairs of the vocabulary, sorted in
# ascending order of the value.
sorted(w2v_vocab.items(), key=lambda x:x[1], reverse=False)[:10]

# Load text collection

In [None]:
# Build the text collection from the corpus directory, passing tokenize_func
# as the tokenizer.
texts = utils.TextCollection(
    corpus_directory="./data/pantip/mobile/corpus",
    tokenize_function=tokenize_func,
)
# test_texts = utils.TextCollection(corpus_directory="./data/pantip/mobile/test_corpus", tokenize_function=tokenize_func)

# Create word_index

In [None]:
# Build word_index
word_index = utils.build_word_index(texts, word2vec_vocab=w2v_vocab)
len(word_index)

# Create embedding_matrix

In [None]:
# Build embedding_matrix
embedding_matrix = utils.get_embedding_matrix(word2vec_model=w2v_model, word_index=word_index, fasttext=False)
len(embedding_matrix)

# Create tag_index

In [None]:
from bailarn.categorization import constant as categorization_constant
from bailarn.categorization.categorization import Categorization

categorization_tag_index = utils.build_tag_index(categorization_constant.TAG_LIST, categorization_constant.TAG_START_INDEX)

# Build input

In [None]:
# Build the categorization inputs from the text collection. The result exposes
# `.x` and `.y`, which the training / prediction / evaluation cells consume.
vs = utils.build_input(
    texts,
    word_index,
    categorization_tag_index,
    categorization_constant.SEQUENCE_LENGTH,
    target='categorization',
)

In [None]:
vs.x[:1]

In [None]:
vs.y[:1]

# Train new model without pre-trained embedding matrix

In [None]:
new_categorization_model = Categorization(new_model=True)

In [None]:
# Train the freshly created model on the full input set (vs.x / vs.y).
new_categorization_model.train(
    X_train=vs.x,
    y_train=vs.y,
    batch_size=100,
    validate_ratio=0.1,
    sensitive_learning=False,
)

In [None]:
new_categorization_model.predict(vs.x[:1], decode_tag=False)

In [None]:
new_categorization_model.predict(vs.x[:1], threshold_selection=0.1, decode_tag=True)

In [None]:
scores = new_categorization_model.evaluate(vs.x, vs.y, threshold_selection=0.5)

In [None]:
scores = new_categorization_model.evaluate(vs.x, vs.y)

# Train new model with pre-trained embedding matrix

In [None]:
categorization_model = Categorization(embedding_matrix=embedding_matrix, new_model=True)

In [None]:
# Show defined embedding weight
# check if it equals the input embedding matrix
categorization_model.model.get_weights()[0]

In [None]:
embedding_matrix

In [None]:
# Train the model that was created with the embedding matrix on the full
# input set (vs.x / vs.y).
categorization_model.train(
    X_train=vs.x,
    y_train=vs.y,
    batch_size=300,
    validate_ratio=0.2,
    sensitive_learning=False,
)

In [None]:
categorization_model.predict(vs.x[:1], decode_tag=False)

In [None]:
categorization_model.predict(vs.x[:1], threshold_selection=0.1, decode_tag=True)

In [None]:
scores = categorization_model.evaluate(vs.x, vs.y)

In [None]:
categorization_model.save(filepath="./bailarn/categorization/models/mock_model.h5")

# Load trained model

In [None]:
loaded_categorization = Categorization(model_path="./bailarn/categorization/models/mock_model.h5", new_model=False)

In [None]:
loaded_categorization.model.get_weights()[0]

In [None]:
categorization_model.model.get_weights()[0]

In [None]:
loaded_categorization.predict(vs.x, decode_tag=False)

In [None]:
scores = loaded_categorization.evaluate(vs.x, vs.y)

# Default model

In [None]:
import json

# Load the word index stored alongside the categorization package.
# The context manager closes the file handle deterministically (a bare
# `json.load(open(...))` leaves it open until garbage collection), and the
# explicit encoding avoids depending on the platform's locale default.
with open('./bailarn/categorization/categorization_word_index.json', encoding='utf-8') as word_index_file:
    word_index = json.load(word_index_file)

# Rebuild the embedding matrix for the word index loaded above.
# NOTE(review): this rebinds `word_index` and `embedding_matrix`, but `vs` was
# built earlier with the previous `word_index` -- confirm whether `vs` must be
# rebuilt before it is fed to the default model.
embedding_matrix = utils.get_embedding_matrix(word2vec_model=w2v_model, word_index=word_index)

## Load test texts from 100,000 pantip data (can be skipped)
---

In [None]:
# # Load test texts from 100,000 pantip data
# # NOTE: only unpickle files from a trusted source
# import os
# import pickle
# texts = {}  # default to an empty dict in case the pickle file is empty
# if os.path.getsize("texts_for_test.p") > 0:
#     with open("texts_for_test.p", "rb") as f:
#         unpickler = pickle.Unpickler(f)
#         # if the file is not empty, texts is replaced by
#         # the unpickled value
#         texts = unpickler.load()

In [None]:
# vs = utils.build_input(texts,
#                        word_index,
#                        categorization_tag_index,
#                        categorization_constant.SEQUENCE_LENGTH,
#                        target='categorization')

---

In [None]:
categorization = Categorization()

In [None]:
# The model's embedding weights have the same shape as embedding_matrix, but the values differ because of the training process
categorization.model.get_weights()[0].shape

In [None]:
embedding_matrix.shape

In [None]:
# Embedding weights of the default model, which has already been trained
categorization.model.get_weights()[0]

In [None]:
embedding_matrix

In [None]:
categorization.predict(vs.x[:1], threshold_selection=0.1, decode_tag=True)

In [None]:
scores = categorization.evaluate(vs.x, vs.y, threshold_selection=0.1)

In [None]:
scores = categorization.evaluate(vs.x, vs.y)

# Others
## Find the best selection threshold for each class manually

In [None]:
# from bailarn.utils import utils
# from bailarn.tokenizer import constant as tokenizer_constant
# from bailarn.tokenizer.tokenizer import Tokenizer
# from bailarn.word_embedder.word2vec import Word2Vec
# from bailarn.categorization import constant as categorization_constant
# from bailarn.categorization.categorization import Categorization
# import pickle
# import json
# import numpy as np
# import os
# from sklearn.metrics import precision_recall_fscore_support
# from tqdm import tqdm

In [None]:
# tag_index = utils.build_tag_index(
#     categorization_constant.TAG_LIST, categorization_constant.TAG_START_INDEX)
# word_index = json.load(open('./bailarn/categorization/word_index_fasttext.json')) 
# len(word_index)

In [None]:
# texts = {}  # default to an empty dict in case the pickle file is empty
# if os.path.getsize("texts_for_test.p") > 0:
#     with open("texts_for_test.p", "rb") as f:
#         unpickler = pickle.Unpickler(f)
#         # if the file is not empty, texts is replaced by
#         # the unpickled value
#         texts = unpickler.load()

In [None]:
# vs = utils.build_input(texts,
#                        word_index,
#                        tag_index,
#                        categorization_constant.SEQUENCE_LENGTH,
#                        target='categorization')

In [None]:
# categorization_model = Categorization(
#     tag_index=tag_index, model_path="./bailarn/categorization/models/cnn_xmtc_fasttext_model.h5")

In [None]:
# y_pred = categorization_model.predict(vs.x, decode_tag=False)
# y_pred[:2]

In [None]:
# threshold_selection_dict = dict((idx, []) for idx in range(
#     len(categorization_constant.TAG_LIST)))
# for class_idx in tqdm(range(len(categorization_constant.TAG_LIST))):
#     label = categorization_constant.TAG_LIST[class_idx]
#     for threshold in np.arange(0, 1.0, 0.005):

#         y_pred_class = []
#         for single_y_pred in y_pred:
#             y_pred_class.append(single_y_pred[class_idx])
#         y_pred_class = np.array(
#             [pred >= threshold for pred in y_pred_class], dtype=np.bool_)

#         y_true_class = []
#         for single_y_true in vs.y:
#             y_true_class.append(single_y_true[class_idx])
#         y_true_class = np.array(y_true_class, dtype=np.bool_)
#         _, _, f1_score, _ = precision_recall_fscore_support(
#             y_true_class, y_pred_class, average='binary')

#         threshold_selection_dict[class_idx].append(f1_score)

In [None]:
# threshold_selection_dict

In [None]:
# # Select max threshold
# write_threshold_selection_dict = {}
# for class_idx in range(len(categorization_constant.TAG_LIST)):
#     max_idx = 0
#     max_value = 0
#     for idx, value in enumerate(threshold_selection_dict[class_idx]):
#         if (value > max_value) & (idx != 0):
#             max_idx = idx
#             max_value = value
#     write_threshold_selection_dict['class_{}'.format(class_idx)] = np.arange(0, 1.0, 0.005)[max_idx]

In [None]:
# write_threshold_selection_dict

In [None]:
# with open('./cnn_xmtc_fasttext_threshold_selection.json', 'w', encoding='utf-8') as outfile:
#     json.dump(write_threshold_selection_dict, outfile)