# Finetuning ID -> SU

In [1]:
import os, sys
sys.path.append('../')
os.chdir('../')

import torch
import shutil
import random
import numpy as np
import pandas as pd
from torch import optim
from transformers import BartModel, GPT2LMHeadModel

from indobenchmark import IndoNLGTokenizer
from utils.train_eval import train, evaluate
from utils.metrics import generation_metrics_fn
from utils.forward_fn import forward_generation
from utils.data_utils import MachineTranslationDataset, GenerationDataLoader

2021-10-17 20:23:43.478163: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [2]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator plus the generators of all CUDA devices), so that
    repeated runs start from the same random state.

    Args:
        seed (int): Seed value handed unchanged to each library.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every GPU rather than only the current one; the call is a
    # silent no-op when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters held by ``module``.

    Args:
        module: Object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
        trainable: When truthy, count only parameters whose
            ``requires_grad`` flag is set; otherwise count every parameter.

    Returns:
        int: Sum of ``numel()`` over the selected parameters.
    """
    total = 0
    for param in module.parameters():
        # Skip frozen parameters only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total
    
# Fix the random state up front so the run can be repeated.
SEED = 26092020
set_seed(SEED)

# Load Model

In [3]:
# Load the pretrained IndoGPT language model and its tokenizer.
# `model` and `gpt_model` are two names for the same object.
# (IndoBART alternative: BartModel.from_pretrained('indobenchmark/indobart'))
model = gpt_model = GPT2LMHeadModel.from_pretrained('indobenchmark/indogpt')
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indogpt')

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


In [4]:
# Echo the module tree to check which architecture was loaded.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(40005, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [5]:
# Total number of parameters (trainable or not) in the loaded model.
count_param(model)

116566272

# Prepare Dataset

In [6]:
# configs and args
# Everything a reader might want to tune lives in this cell; later cells only
# read these names.

lr = 1e-4                   # learning rate given to Adam
gamma = 0.9                 # forwarded to train() together with step_size
lower = True                # forwarded to the datasets as `lowercase`
step_size = 1
beam_size = 5               # forwarded to train() / evaluate()
max_norm = 10               # forwarded to train() -- presumably gradient clipping, TODO confirm
early_stop = 5              # forwarded to train()
device = 'cuda0'            # 'cpu', 'cuda', 'cuda0' or 'cuda:0'
max_seq_len = 512
grad_accumulate = 1
no_special_token = False
swap_source_target = True   # forwarded to the datasets
model_type = 'indo-gpt2' #or indo-bart
valid_criterion = 'SacreBLEU'

# Token ids forwarded to MachineTranslationDataset.
separator_id = 4
speaker_1_id = 5
speaker_2_id = 6

train_batch_size = 8
valid_batch_size = 8
test_batch_size = 8

# Language-id tokens are spelled differently for the BART and GPT vocabularies.
if 'bart' in model_type:
    source_lang = "id_ID"
    target_lang = "su_SU"
elif 'gpt' in model_type:
    source_lang = "[indonesian]"
    target_lang = "[sundanese]"
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError(
        f"Unsupported model_type {model_type!r}: expected a name containing 'bart' or 'gpt'"
    )

src_lid = tokenizer.special_tokens_to_ids[source_lang]
tgt_lid = tokenizer.special_tokens_to_ids[target_lang]
tokenizer.bos_token = target_lang
model.config.decoder_start_token_id = tgt_lid

# Make sure cuda is deterministic
torch.backends.cudnn.deterministic = True

# create directory
model_dir = './save/MT_SUNIBS_INZNTV/example_id_su'
os.makedirs(model_dir, exist_ok=True)

# set a specific cuda device
if device.startswith("cuda"):
    # Accept 'cuda', 'cuda0' and 'cuda:0'; without an index the current
    # CUDA device is kept.
    device_index = device[4:].lstrip(':')
    if device_index:
        torch.cuda.set_device(int(device_index))
    device = "cuda"
    model = model.cuda()

# Build the optimizer only after the model sits on its final device, so the
# optimized parameters live in a consistent location.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [7]:
# Preprocessed JSON files, one per split.
train_dataset_path = './dataset/MT_SUNIBS_INZNTV/train_preprocess.json'
valid_dataset_path = './dataset/MT_SUNIBS_INZNTV/valid_preprocess.json'
test_dataset_path = './dataset/MT_SUNIBS_INZNTV/test_preprocess.json'

# Options that are identical for all three datasets.
dataset_kwargs = dict(
    lowercase=lower,
    no_special_token=no_special_token,
    speaker_1_id=speaker_1_id,
    speaker_2_id=speaker_2_id,
    separator_id=separator_id,
    max_token_length=max_seq_len,
    swap_source_target=swap_source_target,
)
train_dataset = MachineTranslationDataset(train_dataset_path, tokenizer, **dataset_kwargs)
valid_dataset = MachineTranslationDataset(valid_dataset_path, tokenizer, **dataset_kwargs)
test_dataset = MachineTranslationDataset(test_dataset_path, tokenizer, **dataset_kwargs)

# Options that are identical for all three loaders; only the dataset, the
# batch size and the shuffle flag differ per split.
loader_kwargs = dict(
    model_type=model_type,
    tokenizer=tokenizer,
    max_seq_len=max_seq_len,
    src_lid_token_id=src_lid,
    tgt_lid_token_id=tgt_lid,
    num_workers=8,
)
train_loader = GenerationDataLoader(dataset=train_dataset, batch_size=train_batch_size, shuffle=True, **loader_kwargs)
valid_loader = GenerationDataLoader(dataset=valid_dataset, batch_size=valid_batch_size, shuffle=False, **loader_kwargs)
test_loader = GenerationDataLoader(dataset=test_dataset, batch_size=test_batch_size, shuffle=False, **loader_kwargs)

# Test model to generate sequences

In [8]:
# Sanity check: let the language model continue an Indonesian prompt.
# The prompt is moved to the device that holds the model weights, because the
# configuration cell may already have moved the model to the GPU; a CPU
# tensor would then fail with a device-mismatch error.
prompt_ids = tokenizer.encode('<s> aku adalah anak', add_special_tokens=False)
gpt_input = torch.LongTensor([prompt_ids]).to(next(gpt_model.parameters()).device)
gpt_out = gpt_model.generate(gpt_input)
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<s> aku adalah anak pertama dari tiga bersaudara. </s> aku lahir di kota kecil yang sama dengan ayahku.'

In [9]:
# Second sanity check with a conversational prompt.
# The prompt is moved to the device that holds the model weights, because the
# configuration cell may already have moved the model to the GPU; a CPU
# tensor would then fail with a device-mismatch error.
prompt_ids = tokenizer.encode('<s> hai, bagaimana kabar', add_special_tokens=False)
gpt_input = torch.LongTensor([prompt_ids]).to(next(gpt_model.parameters()).device)
gpt_out = gpt_model.generate(gpt_input)
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'<s> hai, bagaimana kabar kalian? semoga sehat selalu ya. kali ini saya akan membahas tentang cara membuat'

# Test model to translate

In [12]:
# Baseline: score the model on the test split before any fine-tuning.
# `device` comes from the configuration cell, so this call follows the same
# device setting as train() instead of assuming CUDA.
test_loss, test_metrics, test_hyp, test_label = evaluate(
    model,
    data_loader=test_loader,
    forward_fn=forward_generation,
    metrics_fn=generation_metrics_fn,
    model_type=model_type,
    tokenizer=tokenizer,
    beam_size=beam_size,
    max_seq_len=max_seq_len,
    is_test=True,
    device=device,
)

TESTING... : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [28:08<00:00, 11.25s/it]


In [13]:
# Gather the metrics and the hypothesis/reference pairs of the last
# evaluate() call into data frames.
metrics_scores = [test_metrics]
result_dfs = [pd.DataFrame({'hyp': test_hyp, 'label': test_label})]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)
# Computed once and reused for both the printout and the CSV file.
metric_summary = metric_df.describe()

print('== Prediction Result ==')
print(result_df.head())
print()

print('== Model Performance ==')
print(metric_summary)

# Persist the predictions and the metric summary next to the checkpoints.
result_df.to_csv(model_dir + "/prediction_result.csv")
metric_summary.to_csv(model_dir + "/evaluation_result.csv")

== Prediction Result ==
                                                 hyp  \
0                                                      
1                                                      
2                                                      
3                                                  .   
4  ilah apa yang dikatakan imam - im am kepala da...   

                                               label  
0  da teu terang naon - na on, tur can tangtu sim...  
1    terus kabeh dal alah ar nepi ka sare ub euh na.  
2  sabab urang mah darma dad am elan allah, anu g...  
3  pikeun anu hat ena suci mah, sagala ge suci. s...  
4  sanggeus leupas, petrus jeung yahya nep angan ...  

== Model Performance ==
           BLEU  SacreBLEU    ROUGE1    ROUGE2    ROUGEL  ROUGELsum
count  1.000000   1.000000  1.000000  1.000000  1.000000   1.000000
mean   0.183491   0.186992  0.726774  0.156468  0.702774   0.709424
std         NaN        NaN       NaN       NaN       NaN        NaN
min    0.183

# Fine Tuning & Evaluation

In [18]:
# Train
# Fine-tune on the training split and validate after every epoch; all
# settings except the epoch count come from the configuration cell.

n_epochs = 10

train(
    model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    forward_fn=forward_generation,
    metrics_fn=generation_metrics_fn,
    valid_criterion=valid_criterion,
    tokenizer=tokenizer,
    n_epochs=n_epochs,
    evaluate_every=1,
    early_stop=early_stop,
    grad_accum=grad_accumulate,
    step_size=step_size,
    gamma=gamma,
    max_norm=max_norm,
    model_type=model_type,
    beam_size=beam_size,
    max_seq_len=max_seq_len,
    model_dir=model_dir,
    exp_id=0,
    fp16="",
    device=device,
)

(Epoch 1) TRAIN LOSS:2.6296 LR:0.00010000: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:56<00:00,  6.42it/s]


(Epoch 1) TRAIN LOSS:2.6296 BLEU:24.57 SacreBLEU:26.26 ROUGE1:51.63 ROUGE2:22.12 ROUGEL:47.25 ROUGELsum:47.27 LR:0.00010000


VALID LOSS:2.9800: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.60it/s]


(Epoch 1) VALID LOSS:2.9800 BLEU:19.66 SacreBLEU:19.80 ROUGE1:47.40 ROUGE2:18.67 ROUGEL:42.88 ROUGELsum:42.88


(Epoch 2) TRAIN LOSS:1.9351 LR:0.00009000: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:54<00:00,  6.51it/s]


(Epoch 2) TRAIN LOSS:1.9351 BLEU:31.44 SacreBLEU:32.80 ROUGE1:60.04 ROUGE2:31.40 ROUGEL:56.40 ROUGELsum:56.40 LR:0.00009000


VALID LOSS:3.0067: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.38it/s]


(Epoch 2) VALID LOSS:3.0067 BLEU:21.01 SacreBLEU:21.15 ROUGE1:48.19 ROUGE2:20.27 ROUGEL:43.86 ROUGELsum:43.87


(Epoch 3) TRAIN LOSS:1.3435 LR:0.00008100: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.30it/s]


(Epoch 3) TRAIN LOSS:1.3435 BLEU:41.75 SacreBLEU:42.59 ROUGE1:69.33 ROUGE2:44.35 ROUGEL:66.75 ROUGELsum:66.75 LR:0.00008100


VALID LOSS:3.1152: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.43it/s]


(Epoch 3) VALID LOSS:3.1152 BLEU:21.11 SacreBLEU:21.22 ROUGE1:48.97 ROUGE2:20.67 ROUGEL:44.11 ROUGELsum:44.14


(Epoch 4) TRAIN LOSS:0.8717 LR:0.00007290: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.29it/s]


(Epoch 4) TRAIN LOSS:0.8717 BLEU:55.12 SacreBLEU:55.45 ROUGE1:78.34 ROUGE2:59.40 ROUGEL:76.67 ROUGELsum:76.67 LR:0.00007290


VALID LOSS:3.2502: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.53it/s]


(Epoch 4) VALID LOSS:3.2502 BLEU:21.49 SacreBLEU:21.59 ROUGE1:48.71 ROUGE2:20.89 ROUGEL:44.06 ROUGELsum:44.05


(Epoch 5) TRAIN LOSS:0.5450 LR:0.00006561: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.29it/s]


(Epoch 5) TRAIN LOSS:0.5450 BLEU:68.19 SacreBLEU:68.05 ROUGE1:85.50 ROUGE2:72.89 ROUGEL:84.54 ROUGELsum:84.54 LR:0.00006561


VALID LOSS:3.3822: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.36it/s]


(Epoch 5) VALID LOSS:3.3822 BLEU:21.72 SacreBLEU:21.86 ROUGE1:48.80 ROUGE2:21.09 ROUGEL:44.35 ROUGELsum:44.35


(Epoch 6) TRAIN LOSS:0.3384 LR:0.00005905: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.30it/s]


(Epoch 6) TRAIN LOSS:0.3384 BLEU:78.53 SacreBLEU:78.09 ROUGE1:90.14 ROUGE2:82.37 ROUGEL:89.57 ROUGELsum:89.58 LR:0.00005905


VALID LOSS:3.4950: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.46it/s]


(Epoch 6) VALID LOSS:3.4950 BLEU:21.83 SacreBLEU:21.97 ROUGE1:49.02 ROUGE2:21.15 ROUGEL:44.55 ROUGELsum:44.56


(Epoch 7) TRAIN LOSS:0.2170 LR:0.00005314: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.31it/s]


(Epoch 7) TRAIN LOSS:0.2170 BLEU:84.71 SacreBLEU:84.09 ROUGE1:92.72 ROUGE2:87.69 ROUGEL:92.29 ROUGELsum:92.29 LR:0.00005314


VALID LOSS:3.6229: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:05<00:00, 19.84it/s]


(Epoch 7) VALID LOSS:3.6229 BLEU:22.15 SacreBLEU:22.29 ROUGE1:49.01 ROUGE2:21.68 ROUGEL:44.58 ROUGELsum:44.57


(Epoch 8) TRAIN LOSS:0.1531 LR:0.00004783: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.31it/s]


(Epoch 8) TRAIN LOSS:0.1531 BLEU:87.97 SacreBLEU:87.26 ROUGE1:94.11 ROUGE2:90.41 ROUGEL:93.64 ROUGELsum:93.64 LR:0.00004783


VALID LOSS:3.6671: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.59it/s]


(Epoch 8) VALID LOSS:3.6671 BLEU:22.02 SacreBLEU:22.15 ROUGE1:49.20 ROUGE2:21.56 ROUGEL:44.64 ROUGELsum:44.67
count stop: 1


(Epoch 9) TRAIN LOSS:0.1166 LR:0.00004305: 100%|█████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:58<00:00,  6.28it/s]


(Epoch 9) TRAIN LOSS:0.1166 BLEU:89.53 SacreBLEU:88.69 ROUGE1:94.72 ROUGE2:91.79 ROUGEL:94.31 ROUGELsum:94.31 LR:0.00004305


VALID LOSS:3.7435: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.23it/s]


(Epoch 9) VALID LOSS:3.7435 BLEU:22.01 SacreBLEU:22.12 ROUGE1:48.96 ROUGE2:21.56 ROUGEL:44.57 ROUGELsum:44.59
count stop: 2


(Epoch 10) TRAIN LOSS:0.0914 LR:0.00003874: 100%|████████████████████████████████████████████████████████████████████████████████████| 746/746 [01:57<00:00,  6.35it/s]


(Epoch 10) TRAIN LOSS:0.0914 BLEU:90.94 SacreBLEU:90.06 ROUGE1:95.22 ROUGE2:92.87 ROUGEL:94.82 ROUGELsum:94.82 LR:0.00003874


VALID LOSS:3.8147: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 20.22it/s]


(Epoch 10) VALID LOSS:3.8147 BLEU:22.18 SacreBLEU:22.31 ROUGE1:49.21 ROUGE2:21.50 ROUGEL:44.72 ROUGELsum:44.75


In [19]:
# Load best model
# The "_0" suffix matches the exp_id=0 passed to train() -- presumably train()
# writes its best checkpoint under this name; verify in utils.train_eval.
# map_location places the stored tensors on the configured device even if the
# checkpoint was saved from a different one.
best_model_path = model_dir + "/best_model_0.th"
model.load_state_dict(torch.load(best_model_path, map_location=device))

<All keys matched successfully>

In [20]:
# Evaluate
test_loss, test_metrics, test_hyp, test_label = evaluate(model, data_loader=test_loader, forward_fn=forward_generation, 
                                                         metrics_fn=generation_metrics_fn, model_type=model_type, 
                                                         tokenizer=tokenizer, beam_size=beam_size, 
                                                         max_seq_len=max_seq_len, is_test=True, 
                                                         device='cuda')

TESTING... : 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 150/150 [02:23<00:00,  1.05it/s]


In [21]:
# Gather the metrics and the hypothesis/reference pairs of the last
# evaluate() call into data frames.
metrics_scores = [test_metrics]
result_dfs = [pd.DataFrame({'hyp': test_hyp, 'label': test_label})]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)
# Computed once and reused for both the printout and the CSV file.
metric_summary = metric_df.describe()

print('== Prediction Result ==')
print(result_df.head())
print()

print('== Model Performance ==')
print(metric_summary)

# Writes to the same file names as the earlier reporting cell, so any
# results saved there are replaced.
result_df.to_csv(model_dir + "/prediction_result.csv")
metric_summary.to_csv(model_dir + "/evaluation_result.csv")

== Prediction Result ==
                                                 hyp  \
0  simkuring nyarita kieu teh, lain ku dum eh ara...   
1         br ak dal alah ar nepi ka sare ub euh eun.   
2  sabab urang teh,  <0xE2> <0x80> <0x94> nu jadi...   
3  nu w angk elang mah, kabeh oge, geus suci. tap...   
4  geus kitu, petrus jeung yahya angkat ka imahna...   

                                               label  
0  da teu terang naon - na on, tur can tangtu sim...  
1    terus kabeh dal alah ar nepi ka sare ub euh na.  
2  sabab urang mah darma dad am elan allah, anu g...  
3  pikeun anu hat ena suci mah, sagala ge suci. s...  
4  sanggeus leupas, petrus jeung yahya nep angan ...  

== Model Performance ==
            BLEU  SacreBLEU     ROUGE1     ROUGE2     ROUGEL  ROUGELsum
count   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000
mean   14.185724  14.180255  32.490315  15.192999  28.393299  28.389189
std          NaN        NaN        NaN        NaN        NaN        