# Set up

### > Install and import libraries

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# This BERT release predates TensorFlow 2, so TensorFlow 1.x is required.
# The GPU build is used because fine-tuning on CPU takes far longer.
# sentencepiece tokenizes Thai sentences; gdown downloads files from Google Drive.
import subprocess
import sys

for requirement in ('tensorflow-gpu==1.15', 'sentencepiece', 'gdown'):
  # `python -m pip` installs into the interpreter that runs this notebook, and
  # check_call raises CalledProcessError when an install fails instead of
  # silently discarding the exit status the way os.system does.
  subprocess.check_call([sys.executable, '-m', 'pip', 'install', requirement])

### > Set GPU

In [None]:
# BERT fine-tuning is compute- and memory-intensive, so Colab's free GPU saves a lot of time and money.
# To enable the GPU accelerator: Runtime > Change runtime type > under 'Hardware accelerator', select GPU > Save.
# Colab then reconnects; run this cell to confirm that a GPU is attached.
import subprocess

try:
  # Capture the report and print it from Python: output written directly by an
  # os.system child process is not guaranteed to appear in the notebook cell.
  gpu_report = subprocess.run(['nvidia-smi'],
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT,
                              universal_newlines=True)
except FileNotFoundError:
  print('nvidia-smi was not found; enable the GPU runtime as described above.')
else:
  print(gpu_report.stdout)
  if gpu_report.returncode != 0:
    print('nvidia-smi exited with status', gpu_report.returncode)

# Download required files

### > Thai Pretrained BERT model - bert_base_th (ThAIKeras)

In [None]:
# Download the pretrained Thai BERT archive from Google Drive.
# The file is addressed by its share URL instead of `--id`, a flag that newer gdown
# releases deprecate; -O fixes the local file name to the one the unzip step expects.
os.system("gdown 'https://drive.google.com/uc?id=1J3uuXZr_Se_XIFHj7zlTJ-C9wzI9W_ot' -O bert_base_th.zip")

In [None]:
# Extract the downloaded pretrained-model archive into the working directory.
unzip_bert_cmd = 'unzip bert_base_th.zip'
os.system(unzip_bert_cmd)

### > th_wiki_bpe

In [None]:
# Download the th_wiki_bpe archive from Google Drive.
# The file is addressed by its share URL instead of `--id`, a flag that newer gdown
# releases deprecate; -O fixes the local file name to the one the unzip step expects.
os.system("gdown 'https://drive.google.com/uc?id=1F7pCgt3vPlarI9RxKtOZUrC_67KMNQ1W' -O th_wiki_bpe.zip")

In [None]:
# Extract the BPE archive into ./th_wiki_bpe.
# makedirs(exist_ok=True) lets the cell be re-run (os.mkdir raises when the
# directory already exists), and the relative target matches both the directory
# created here and the relative BPE_DIR used for fine-tuning. The previous
# '/content/th_wiki_bpe' target only agreed with them when running from /content.
os.makedirs('th_wiki_bpe', exist_ok=True)
os.system('unzip th_wiki_bpe.zip -d th_wiki_bpe')

### > Truevoice Intent dataset

In [None]:
# Fetch the TrueVoice intent dataset and unpack its archive inside the cloned repository.
os.system('git clone https://github.com/PyThaiNLP/truevoice-intent.git')

# Fail loudly if the archive is not there (e.g. the clone failed) rather than
# continuing with a missing dataset.
if not os.path.isfile('truevoice-intent/mari-intent.zip'):
  raise FileNotFoundError('truevoice-intent/mari-intent.zip is missing; did the git clone succeed?')

# Extract with -d instead of chdir-ing into the repository and back, so the
# notebook's working directory is never left changed if this step is interrupted.
os.system('unzip truevoice-intent/mari-intent.zip -d truevoice-intent')

### > BERT classifier finetuner modified for Thai
https://github.com/KongpolC/bert

In [None]:
# Fetch the BERT classifier fine-tuning scripts modified for Thai.
bert_repo_clone_cmd = 'git clone https://github.com/KongpolC/bert.git'
os.system(bert_repo_clone_cmd)

Now you should have these folders in the directory
```
.
|-- bert
|-- bert_base_th
|-- th_wiki_bpe
|-- truevoice-intent
```

# Finetune the model

In [None]:
# Create the 'model' directory that stores the fine-tuned model.
# exist_ok=True keeps the cell re-runnable: os.mkdir raises FileExistsError
# when the directory is already there.
os.makedirs('model', exist_ok=True)

In [None]:
# Export the directories that the fine-tuning command reads as shell variables.
os.environ.update({
  'BPE_DIR': 'th_wiki_bpe',         # BPE vocabulary and SentencePiece model files
  'DATA_DIR': 'truevoice-intent',   # dataset directory
  'OUTPUT_DIR': 'model',            # fine-tuned checkpoints and prediction results
  'BERT_BASE_DIR': 'bert_base_th',  # pretrained Thai BERT config and checkpoint
})

If your dataset is larger or your GPU has less memory and training fails with an out-of-memory (OOM) error, decrease `train_batch_size`.

In [None]:
# Run fine-tuning, evaluation and prediction in a single invocation
# (do_train, do_eval and do_predict are all enabled).
# $DATA_DIR, $BPE_DIR, $BERT_BASE_DIR and $OUTPUT_DIR are the environment
# variables declared in the previous cell.
# Lower --train_batch_size if the GPU runs out of memory.
!python bert/run_classifier.py \
  --task_name=truevoice \
  --do_train=true \
  --do_eval=true \
  --do_predict=true \
  --data_dir=$DATA_DIR \
  --vocab_file=$BPE_DIR/th.wiki.bpe.op25000.vocab \
  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
  --init_checkpoint=$BERT_BASE_DIR/model.ckpt \
  --max_seq_length=128 \
  --train_batch_size=32 \
  --learning_rate=5e-5 \
  --num_train_epochs=2.0 \
  --output_dir=$OUTPUT_DIR \
  --spm_file=$BPE_DIR/th.wiki.bpe.op25000.model

The Colab runtime often disconnects, and all files created during the session are then lost. It is a good idea to download the prediction results and the latest model automatically once training has completed.

In [None]:
# Save the prediction results and the newest checkpoint locally, in case the
# Colab runtime reconnects and the files are lost.
import re

from google.colab import files

# Download test results
files.download('model/test_results.tsv')

# Map every checkpoint file in 'model' to its training step.
# A regex is used instead of positional string slicing so that unrelated or
# oddly named files are skipped rather than crashing int().
# (The list is no longer called `dir`, which shadowed the built-in dir() for
# the rest of the notebook.)
checkpoint_pattern = re.compile(r'model\.ckpt-(\d+)')
checkpoint_steps = {}
for fname in os.listdir('model'):
  found = checkpoint_pattern.search(fname)
  if found:
    checkpoint_steps[fname] = int(found.group(1))

# Download every file that belongs to the checkpoint with the highest step.
latest_step = max(checkpoint_steps.values(), default=0)
for fname, fstep in sorted(checkpoint_steps.items()):
  if fstep == latest_step:
    print('downloading ' + fname)
    files.download('model/' + fname)

# Test Accuracy
You could test its accuracy by setting do_eval=true when finetuning. Here, I added a multi-class confusion matrix to give more information about the prediction in order to tune the model or come up with improvement strategies.

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes, destination=None,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of `y_pred` against `y_true`.

    Arguments:
    y_true -- iterable of actual labels.
    y_pred -- iterable of predicted labels.
    classes -- tick labels for both axes. When every label found in the data
               is listed in `classes`, the matrix rows/columns follow the
               order of `classes`, so ticks and cells always agree. Otherwise
               (e.g. display names for integer labels) scikit-learn's default
               sorted label order is kept and `classes` must be given in
               that order.
    destination -- optional path to save the figure to; nothing is saved
                   when it is None.
    normalize -- if True, each row is divided by its total so cells show the
                 fraction of each true class.
    title -- plot title; a default depending on `normalize` is used if empty.
    cmap -- matplotlib colormap for the cells.

    Returns:
    ax -- the matplotlib Axes the matrix was drawn on.
    """
    import warnings

    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Tie the matrix order to `classes` whenever the data labels are a subset
    # of it. Without `labels=`, confusion_matrix sorts the labels found in the
    # data, which silently mislabels the ticks if `classes` is ordered
    # differently or contains a class that never occurs.
    class_list = list(classes)
    observed = set(y_true) | set(y_pred)
    labels = class_list if observed and observed <= set(class_list) else None
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class with no true samples has an all-zero row; dividing by 1
        # keeps that row at 0 instead of producing NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Set the tick label rotation and alignment.
    plt.setp(ax.get_xticklabels(), rotation=0, ha="right",
             rotation_mode="anchor")

    # Annotate every cell with its value; use white text on dark cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    # Saving stays best-effort (a failure must not lose the plot), but it is
    # skipped when no destination is given and reported when it fails,
    # instead of swallowing every exception with a bare `except`.
    if destination is not None:
        try:
            fig.savefig(destination)
        except Exception as err:
            warnings.warn('Could not save confusion matrix to {!r}: {}'.format(destination, err))

    return ax

In [None]:
def compare_answer(row):
  '''
  Tells whether the predicted class of a row equals its actual class.

  Arguments:
  row -- mapping-like record with 'prediction' and 'actual' entries.

  Returns:
  -- 1 if both entries are equal
  -- 0 otherwise.
  '''
  predicted, expected = row['prediction'], row['actual']
  return 1 if predicted == expected else 0

In [None]:
def benchmark(actual_path, pred_path):
  '''
  Calculates model accuracy and plots the confusion matrix.

  Arguments:
  actual_path -- path of the comma-separated test file; its 'destination'
                 column holds the actual labels.
  pred_path -- path of the tab-separated prediction file. The predicted label
               of a row is the name of its highest-valued column, so the file
               is assumed to have one score column per class, named after the
               class label.

  Returns:
  accuracy -- fraction of rows whose predicted label equals the actual label.
  result -- dataframe with the prediction scores plus the 'prediction',
            'actual' and 'correct' (1/0) columns.

  Raises:
  ValueError -- if the two files do not contain the same number of rows.
  '''

  # Read files to dataframes
  actual = pd.read_csv(actual_path, sep=',')
  pred = pd.read_csv(pred_path, sep='\t')

  # Rows are paired by position; a length mismatch would otherwise be aligned
  # silently and give a meaningless score.
  if len(actual) != len(pred):
    raise ValueError('Row count mismatch: {} actual labels vs {} predictions'
                     .format(len(actual), len(pred)))

  # Create result dataframe
  result = pred.copy()
  result['prediction'] = pred.idxmax(axis=1)
  result['actual'] = actual['destination']
  result['correct'] = (result['prediction'] == result['actual']).astype(int)

  # Calculate accuracy
  accuracy = result['correct'].mean()

  # Confusion matrix. scikit-learn orders rows/columns by the sorted labels
  # found in the data, so the tick labels are derived the same way; the
  # previous hard-coded class list was in a different order and mislabelled
  # the axes.
  classes = sorted(set(result['actual']) | set(result['prediction']))
  plot_confusion_matrix(result['actual'], result['prediction'], classes=classes,
                        destination=None, normalize=False, title='Confusion Matrix')

  return accuracy, result

In [None]:
# Benchmark accuracy against the labelled test file
actual_path = 'truevoice-intent/mari_test.csv'
pred_path = 'model/test_results.tsv'
accuracy, result = benchmark(actual_path, pred_path)
# Display the score: the cell previously computed the accuracy without showing it.
print('Test accuracy: {:.4f}'.format(accuracy))