In [10]:
import os, sys
import torch
from transformers import BartForConditionalGeneration, GPT2LMHeadModel, BartModel
from indobenchmark import IndoNLGTokenizer

In [11]:
def count_param(module, trainable=False):
    """Count the scalar elements held by a module's parameters.

    Args:
        module: Object exposing ``parameters()``, which yields items that
            provide ``numel()`` and a ``requires_grad`` attribute.
        trainable: When truthy, only parameters whose ``requires_grad`` is
            truthy are counted; otherwise every parameter is counted.

    Returns:
        The sum of ``numel()`` over the selected parameters (0 if none).
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for the
        # trainable subset; `requires_grad` is not consulted otherwise.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total

# Init Model

In [13]:
# Load the pretrained IndoBART and IndoGPT checkpoints.
# `bart_model` must be defined here: the BART cells further down use it, and a
# fresh kernel (Restart & Run All) would otherwise fail with a NameError.
# BartForConditionalGeneration is used instead of the bare BartModel because
# those cells read `.logits` and call `.generate()`, which need the LM head.
bart_model = BartForConditionalGeneration.from_pretrained('indobenchmark/indobart')
gpt_model = GPT2LMHeadModel.from_pretrained('indobenchmark/indogpt')
# A single tokenizer instance is shared by the GPT and BART cells in this notebook.
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indogpt')

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


# Test GPT Model

In [14]:
# Encode the prompt as written (the '<s>' marker is part of the text, and
# add_special_tokens=False asks the tokenizer not to add any of its own),
# then wrap the ids as a batch containing one sequence.
gpt_input = torch.tensor(
    [tokenizer.encode('<s> aku adalah anak', add_special_tokens=False)],
    dtype=torch.long,
)
# Continue the prompt using the model's default generation settings.
gpt_out = gpt_model.generate(gpt_input)
# Display the decoded text of the first (only) sequence in the batch.
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
2021-10-17 15:45:47.814024: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


'aku adalah anak pertama dari tiga bersaudara. aku lahir di kota kecil yang sama dengan ayahku.'

In [15]:
# Second prompt: encode it as written (no tokenizer-added special tokens)
# and wrap the ids as a batch containing one sequence.
gpt_input = torch.tensor(
    [tokenizer.encode('<s> aku suka sekali makan', add_special_tokens=False)],
    dtype=torch.long,
)
# Continue the prompt using the model's default generation settings.
gpt_out = gpt_model.generate(gpt_input)
# Display the decoded text of the first (only) sequence in the batch.
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'aku suka sekali makan di sini. aku suka sekali dengan menu yang ada di sini. aku'

In [16]:
# Third prompt: encode it as written (no tokenizer-added special tokens)
# and wrap the ids as a batch containing one sequence.
gpt_input = torch.tensor(
    [tokenizer.encode('<s> hai, bagaimana kabar', add_special_tokens=False)],
    dtype=torch.long,
)
# Continue the prompt using the model's default generation settings.
gpt_out = gpt_model.generate(gpt_input)
# Display the decoded text of the first (only) sequence in the batch.
tokenizer.decode(gpt_out[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'hai, bagaimana kabar kalian? semoga sehat selalu ya. kali ini saya akan membahas tentang cara membuat'

# Test BART Model

In [13]:
# Build model inputs for one masked sentence via the tokenizer's BART helper.
bart_input = tokenizer.prepare_input_for_generation(
    ['aku adalah <mask>'],
    model_type='indobart',
    return_tensors='pt',
)
# Single forward pass (no generation loop).
bart_out = bart_model(**bart_input)
# Take the top-1 index along the last axis of the logits, drop the
# size-1 axes, and decode the resulting ids as one string.
tokenizer.decode(
    bart_out.logits.topk(1).indices.squeeze()
)

'nyaétaselengkapnyahij<mask>'

In [11]:
# Batch of two masked sentences, with explicit decoder-side inputs for each.
bart_input = tokenizer.prepare_input_for_generation(
    ['dia adalah anak <mask>', '<mask> adalah anak gembala'],
    decoder_inputs=['dia adalah anak <mask>', 'aku adalah anak <mask>'],
    model_type='indobart',
    return_tensors='pt',
)
# Single forward pass over the whole batch.
bart_out = bart_model(**bart_input)
# For each of the two batch rows, decode the top-1 token at every position;
# the cell displays the pair of decoded strings as a tuple.
tuple(
    tokenizer.decode(bart_out.logits.topk(1).indices[row].squeeze())
    for row in (0, 1)
)

('cik anak anak<mask> nyaéta', 'gembala gembala gembala<mask> gembala')

In [19]:
# Generate from the most recently built `bart_input` and show the first
# sequence of token ids in the returned batch.
# NOTE(review): 100 is a hard-coded decoder start token id; which token it
# maps to is not visible in this notebook - confirm against the tokenizer.
bart_model.generate(
    decoder_start_token_id=100,
    **bart_input,
)[0]

tensor([  100, 23273, 23273, 23273,  6278,  6278,  6278, 39983, 39983, 39983,
          368,   368,   368,   756,   756,   756,  2863,  2863,  2863,     2])

In [None]:
# Decode token id 2 on its own to inspect it; it is the final id in the
# generated sequence shown above.
tokenizer.decode([2])

''