## Load Dataset: Cleaned LST20 Corpus

In [None]:
!nvidia-smi

In [None]:
import pandas as pd

# Read the cleaned training CSV and discard every row that has a missing value
# in any column.
df_original = pd.read_csv('clean_train.csv').dropna()

# Only the token text and its POS tag are used by the rest of the notebook.
df_base = df_original.loc[:, ['text', 'POS']]
df_base.head(10)

## Define POS Tags and their conversions

In [None]:
# The position of a tag in this list is its integer class id.
POS_TAGS = ["NN", "VV", "PU", "CC", "PS", "AX", "AV", "FX", "NU", "AJ", "CL", "PR", "NG", "PA", "XX", "IJ"]

# Reverse lookup built once, so encoding a tag is a dict hit instead of a
# linear scan of POS_TAGS for every row of the corpus.
_POS_TAG_TO_ID = {tag: idx for idx, tag in enumerate(POS_TAGS)}


def encode_label(text):
    """Return the integer class id of the POS tag ``text``.

    Raises:
        ValueError: if ``text`` is not one of ``POS_TAGS`` (the same exception
            type that ``list.index`` raised before).
    """
    try:
        return _POS_TAG_TO_ID[text]
    except KeyError:
        raise ValueError(f"{text!r} is not in POS_TAGS") from None


def decode_label(num):
    """Return the POS tag whose class id is ``num``.

    Plain list indexing is used, so an out-of-range id raises ``IndexError``
    and a negative id counts from the end of ``POS_TAGS``.
    """
    return POS_TAGS[num]

## Split into train/val/test sets

In [None]:
# Do not shuffle to preserve contexts
from sklearn.model_selection import train_test_split

# Contiguous 80/10/10 split; the test part absorbs whatever is left over after
# the two int() truncations.
size_base = len(df_base)
size_train = int(0.8 * size_base)
size_val = int(0.1 * size_base)
size_test = size_base - size_train - size_val

# First cut the tail off as the test set, then cut the validation set off the
# end of what remains. shuffle=False keeps the original row order in each part.
df_trainval, df_test = train_test_split(
    df_base,
    train_size=size_train + size_val,
    test_size=size_test,
    shuffle=False,
)
df_train, df_val = train_test_split(
    df_trainval,
    train_size=size_train,
    test_size=size_val,
    shuffle=False,
)

In [None]:
print(size_train, size_val, size_test)

## Import pre-trained tokenizer

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('airesearch/wangchanberta-base-att-spm-uncased')

In [None]:
from pprint import pprint

dict_word2int = tokenizer.vocab
pprint(list(dict_word2int.items())[:5])

## Play with the tokenizer a bit to see how it works

In [None]:
vocab_all = list(dict_word2int.keys())

_word_indict = "สวัสดี"
_code_indict = dict_word2int[_word_indict]
print(_word_indict, _code_indict)

_word_notindict = "สวัสดีค้าบ ท่านสมาชิกชมรมคนชอบ NLP"
# _code_notindict = dict_word2int[_word_notindict]
# print(_word_notindict, _code_notindict)

In [None]:
# Reverse lookup: token id -> vocabulary entry.
dict_int2word = dict(zip(dict_word2int.values(), dict_word2int.keys()))

# Tokenize the out-of-vocabulary sample and map every id back to its piece.
_codes_nowindict_tokenized = tokenizer(_word_notindict)
_words_nowindict_tokenized = []
for _piece_id in _codes_nowindict_tokenized['input_ids']:
    _words_nowindict_tokenized.append(dict_int2word[_piece_id])

pprint(_codes_nowindict_tokenized)
print(_words_nowindict_tokenized)

In [None]:
# Important codes and corresponding words
# Show the vocabulary entry behind each special token id used in this notebook.
for _code, _meaning in (
    (3, "\t Unknown"),
    (5, "\t Sentence Start"),
    (6, "\t Sentence End"),
    (10, "\t Space"),
):
    print(dict_int2word[_code], _meaning)

## Encode all words and tags into numbers

In [None]:
# A word that is not a vocabulary entry is represented by the FIRST sub-word
# piece the tokenizer produces for it, ignoring the sentence-start (5),
# sentence-end (6) and space (10) ids.

def encode_word(word):
    """Map one word to a single token id.

    * A lone space maps to id 10.
    * A word that is a key of ``dict_word2int`` maps to its own id.
    * Anything else is tokenized and the first id that is not 5, 6 or 10 is
      returned; if there is no such id the result is 3 (the unknown token).
    """
    if word == " ":
        return 10
    if word in dict_word2int:
        return dict_word2int[word]

    piece_ids = tokenizer(word)['input_ids']
    return next((piece for piece in piece_ids if piece not in (5, 6, 10)), 3)

In [None]:
def _encode_split(frame):
    """Return (token ids, tag ids) for one split as two plain Python lists."""
    word_ids = frame['text'].map(encode_word).tolist()
    tag_ids = frame['POS'].map(encode_label).tolist()
    return word_ids, tag_ids


train_words_payload, train_labels_payload = _encode_split(df_train)
val_words_payload, val_labels_payload = _encode_split(df_val)
test_words_payload, test_labels_payload = _encode_split(df_test)

In [None]:
print(df_train['text'].iloc[:5].tolist())
print(df_train['POS'].iloc[:5].tolist())
print()
print(train_words_payload[:5])
print(train_labels_payload[:5])

## Batch data

In [None]:
import numpy as np

BATCH_SIZE = 128

def batched_array(data:list, batch_size=None):
    """Cut a flat sequence into consecutive rows of ``batch_size`` items.

    Trailing items that do not fill a complete row are dropped.

    Args:
        data: flat sequence of values.
        batch_size: row length; defaults to the module-level ``BATCH_SIZE``,
            read at call time.

    Returns:
        A 2-D ``numpy`` array of shape ``(len(data) // batch_size, batch_size)``.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    data = np.array(data)
    # Count how many items to keep rather than slicing with a negative
    # remainder: when the length is an exact multiple the remainder is 0 and
    # ``data[:-0]`` would throw the whole array away.
    usable = (len(data) // batch_size) * batch_size
    return data[:usable].reshape(-1, batch_size)

# Inputs (token ids) and targets (tag ids) are batched the same way, so row i
# of x_* lines up with row i of y_*.
x_train, y_train = batched_array(train_words_payload), batched_array(train_labels_payload)
x_val, y_val = batched_array(val_words_payload), batched_array(val_labels_payload)
x_test, y_test = batched_array(test_words_payload), batched_array(test_labels_payload)

## Recall important quantities and hyperparameters

In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint at the 'finetuned@lst20-pos' revision; only its
# word-embedding matrix is kept, as a NumPy array.
model_hug = AutoModelForTokenClassification.from_pretrained(
    'airesearch/wangchanberta-base-att-spm-uncased',
    revision='finetuned@lst20-pos',
)
_word_embedding_layer = model_hug.roberta.embeddings.word_embeddings
word_em_initial = _word_embedding_layer.weight.detach().numpy()

In [None]:
from matplotlib import pyplot as plt

pos_freqs = df_base['POS'].value_counts().loc[POS_TAGS]
pos_freqs.plot.bar()
plt.show()

print(pos_freqs)

In [None]:
# This is experimental and purely from thin air

# Small constant added to numerator and denominator of the min-max scaling.
E = 1e-12

# Relative frequency of every tag, in POS_TAGS order.
_tag_counts = pos_freqs.to_numpy()
freqs = _tag_counts / np.sum(_tag_counts)

# Rarer tags get larger values; the result is min-max scaled.
_rarity = 1 - freqs
_lowest, _highest = _rarity.min(), _rarity.max()
uwu = (_rarity - _lowest + E) / (_highest - _lowest + E)

plt.bar(list(range(len(uwu))), uwu)
plt.show()

print(uwu)

In [None]:
num_class = len(POS_TAGS)             # Number of possible tags into which a model should classify
vocab_len = word_em_initial.shape[0]  # Number of rows of the pre-trained embedding matrix
embed_dim = word_em_initial.shape[1]  # The dimension of the embedding vector

# Give the per-tag weights a leading batch axis and a sequence axis, then repeat
# them along the sequence axis. The repeat count is BATCH_SIZE (not a literal
# 128) so it always matches the row length of the batched inputs.
uwu_new = uwu[None, None, :]
uwu_new = np.tile(uwu_new, (1, BATCH_SIZE, 1))
print(uwu_new.shape)

## Create a model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import \
Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Softmax

# Every input row holds BATCH_SIZE token ids, so BATCH_SIZE is the sequence
# length seen by the network. `shape` is given as a tuple: a bare int is not a
# documented shape value.
inputs = Input(shape=(BATCH_SIZE,), name='WordIndex')

# Frozen embedding table; its weights are filled in after the model is built.
x = Embedding(trainable=False, input_dim=vocab_len, output_dim=embed_dim, name="Embedding")(inputs)
x = Bidirectional(LSTM(256, return_sequences=True, dropout=0.2))(x)

# Per-position classifier head.
x = TimeDistributed(Dense(128, activation='relu'), name='Dense_1')(x)
x = TimeDistributed(Dense(128, activation='relu'), name='Dense_2')(x)
x = TimeDistributed(Dense(128, activation='relu'), name='Dense_3')(x)
x = TimeDistributed(Dense(num_class, activation=None), name='Dense_Classifier')(x)

# Experimental: scale each tag's logit by its weight before the softmax.
x = tf.multiply(x, uwu_new)
x = TimeDistributed(Softmax(), name='Softmax')(x)

model = tf.keras.Model(inputs, x, name="EXP_POS_TAGGER")
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that only appended a stray "None" line).
model.summary()

In [None]:
# set_weights takes one array per layer weight; the Embedding layer has exactly
# one, the (vocab_len, embed_dim) table.
model.get_layer("Embedding").set_weights([word_em_initial])

## Train the model

In [None]:
# Stop once val_loss has not improved for 3 epochs in a row and roll the model
# back to the best weights seen.
ES = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=3,
    restore_best_weights=True,
    verbose=1,
)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=100,
    batch_size=BATCH_SIZE,
    callbacks=[ES],
)

## Evaluate model

In [None]:
from matplotlib import pyplot as plt

# NOTE(review): relies on history.history holding exactly four series in the
# order loss, accuracy, val_loss, val_accuracy — confirm for the Keras version in use.
train_loss, train_acc, eval_loss, eval_acc = list(history.history.values())

fig, axes = plt.subplots(1, 2, figsize=[15, 5])
fig.suptitle('Model History')

# Left panel: loss curves.
axes[0].set_title("Cross Entropy Loss")
axes[0].set_xlabel('epoch')
axes[0].set_ylabel('loss')
axes[0].plot(train_loss, c='b')
axes[0].plot(eval_loss, c='r')
# The legend belongs to the loss panel and is added after its lines exist
# (it used to be attached to axes[1] before anything was drawn there).
axes[0].legend(['train', 'val'], loc='upper left')

# Right panel: accuracy curves.
axes[1].set_title("Accuracy")
axes[1].set_xlabel('epoch')
axes[1].set_ylabel('accuracy')
axes[1].plot(train_acc, c='b')
axes[1].plot(eval_acc, c='r')
axes[1].legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
%%time

metrics = model.evaluate(x_test, y_test)

In [None]:
print(f'loss: {metrics[0]:.4f} | acc: {metrics[1]:.4f}')

In [None]:
from sklearn.metrics import classification_report

def flatten_and_encode(l:list):
    """Flatten a sequence of batches and turn every class id into its POS tag.

    Despite the name, this DECODES: each element goes through ``decode_label``.
    """
    tags = []
    for batch in l:
        for element in batch:
            tags.append(decode_label(element))
    return tags

# Most probable tag id per position, then tag strings for both predictions and gold.
_test_pred_ids = model.predict(x_test).argmax(axis=2)
y_test_pred = flatten_and_encode(_test_pred_ids)
y_test_gold = flatten_and_encode(y_test)
print(classification_report(y_test_gold, y_test_pred, labels=POS_TAGS))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cf_matrix = confusion_matrix(y_test_gold, y_test_pred, labels=POS_TAGS)
cf_heatmap = ConfusionMatrixDisplay(cf_matrix, display_labels=POS_TAGS)
cf_heatmap.plot(include_values=False, cmap='hot')
plt.show()

In [None]:
# Batching dropped the incomplete last row, so only the first len(y_test_gold)
# test words have a prediction.
_n_scored = len(y_test_gold)
_scored = pd.DataFrame({
    'text': df_test['text'].tolist()[:_n_scored],
    'POS': y_test_gold,
    'pred': y_test_pred,
})

# Keep only the rows where the prediction differs from the gold tag.
_is_wrong = _scored['POS'] != _scored['pred']
df_analysis = _scored[_is_wrong]
df_analysis.info()

In [None]:
df_analysis.head()

In [None]:
analysis_freqs = df_analysis['pred'].value_counts()
analysis_freqs.plot.bar()
plt.show()

print(analysis_freqs)

In [None]:
analysis_tags = df_analysis['POS'].unique()
cf_matrix = confusion_matrix(df_analysis['POS'], df_analysis['pred'], labels=analysis_tags)
cf_heatmap = ConfusionMatrixDisplay(cf_matrix, display_labels=analysis_tags)
cf_heatmap.plot(include_values=False, cmap='hot')
plt.show()

## Test Submission

In [None]:
!pip install gdown

In [None]:
!unzip pos_test.txt.zip -d ./submission

In [None]:
# One token per line; an empty line is recorded as '_'.
word_list_submission = []
# The file holds Thai text, so the encoding is stated explicitly instead of
# depending on the platform default.
with open("/kaggle/working/submission/pos_test.txt", "r", encoding="utf-8") as f:
    for line in f:
        if line != '\n':
            # Remove only the line terminator; slicing off the last character
            # would eat a real character on a final line without a newline.
            word_list_submission.append(line.rstrip('\n'))
        else:
            word_list_submission.append('_')

print(len(word_list_submission))
# NOTE(review): the last entry is dropped unconditionally — presumably a
# trailing separator line; confirm against the expected submission length.
word_list_submission = word_list_submission[:-1]
print(len(word_list_submission))

In [None]:
# Pad to retain all information
# 23 filler entries are appended; the same 23 are sliced off again after
# prediction. NOTE(review): presumably this makes the length a multiple of
# BATCH_SIZE for the later reshape — confirm. Re-running this cell pads again.
word_list_submission.extend(['_'] * 23)

In [None]:
from tqdm.notebook import tqdm

word_list_submission_encoded = [encode_word(r) for r in tqdm(word_list_submission)]

In [None]:
import numpy as np

BATCH_SIZE = 128

np_word_list_submission_encoded_batched = np.array(word_list_submission_encoded).reshape(-1,BATCH_SIZE)

In [None]:
np_word_list_submission_encoded_batched.shape

In [None]:
np_word_list_submission_encoded_batched[0]

In [None]:
!gdown --id 1ailLw7SoC2mh-b9aLab4pccGnyTTO2Xa

In [None]:
import tensorflow as tf

model = tf.keras.models.load_model('/kaggle/working/exp_pos_ud_bilstm_weight_final.h5')

In [None]:
%%time
z = model.predict(np_word_list_submission_encoded_batched).argmax(axis=2)

In [None]:
# Element-wise class id -> tag, then flatten and drop the 23 padding entries
# that were appended before batching.
func2_ = np.vectorize(decode_label)
z_decode = func2_(z).flatten().tolist()[:-23]

print(len(z_decode))
print(z_decode[:50])

In [None]:
submission_df = pd.read_csv("pos_sample_submission.csv")

In [None]:
submission_df.Predicted = z_decode

In [None]:
submission_df.to_csv("exp_pos_ud_bilstm_weight_final.csv", index=False)

In [None]:
model.save('exp_pos_ud_bilstm_weight_final.h5')

In [None]:
submission_df.info()

In [None]:
df_analysis.head(10)

# Special: Added post-processing

In [None]:
df_post = pd.read_csv('exp_pos_ud_bilstm_weight_final.csv')

In [None]:
df_post.head()

In [None]:
df_post['word'] = word_list_submission[:-23]

In [None]:
df_post.head()

In [None]:
df_post.tail()

In [None]:
# Hand-written overrides: every occurrence of words_rule[k] is forced to the tag
# tags_rule[k], whatever the model predicted.
words_rule = ['เยอะ','ใหญ่','ใกล้','เล็ก','หวาน','เร็ว','มาก','สวยงาม','ช้า','นํ้าเงิน','โบราณ','จริง','ดี','เฉพาะ','อาวุโส','ฟรี','สุภาพ/','พ.ต.อ.','ทั่วถึง','วิกฤต','เก่ง','เจ๋ง','มืด']
tags_rule = ['AJ','AJ','PS','AJ','AJ','AV','AV','AV','AV','AJ','AJ','AV','AV','AV','AJ','AJ','AV','NN','AV','AJ','AJ','AJ','AJ']
for _rule_word, _rule_tag in zip(words_rule, tags_rule):
    _hits = df_post['word'] == _rule_word
    df_post.loc[_hits, 'Predicted'] = _rule_tag

In [None]:
df_post.head(5)

In [None]:
df_sub = df_post[['Id', 'Predicted']]

In [None]:
df_sub.tail()

In [None]:
df_sub.to_csv('exp_pos_ud_bilstm_weight_post_final_finally.csv', index=False)