In [None]:
# installing required packages
# On local machine, please install all of the requirements
# with `python>=3.10`

%pip install transformers

In [None]:
# loading the model and tokenizer
# CAUTION: polyglot-ko checkpoints are larger than 5GB.
# Check your network speed and available RAM before running.

from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM as AutoModel

# Hub identifier of the checkpoint; the tokenizer and the model are both
# loaded from this same id.
CKPT = "nojiyoon/nallm-polyglot-ko-1.3b-base"

# NOTE: `AutoModel` here is the notebook's alias for `AutoModelForCausalLM`
# (see the import above), not the bare `transformers.AutoModel` class.
tokenizer = AutoTokenizer.from_pretrained(CKPT)
model = AutoModel.from_pretrained(CKPT)

In [None]:
# setting up the device
# CAUTION polyglot-ko is a big model. Check GPU status before setting 'cuda'
# (1.3b checkpoint takes around 6GB of VRAM)

# Polyglot-ko inference is _slow_ on 'cpu'
# If possible, 'cuda' acceleration is better
# Optionally use 'mps' on macOS with Apple Silicon

import torch

# Pick the best available backend: CUDA first, then Apple-Silicon MPS, then CPU.
# `is_available` must be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
# `torch.backends.mps` is looked up defensively so that torch builds without
# the MPS backend fall through to 'cpu' instead of raising AttributeError.
mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = 'cuda'
elif mps_backend is not None and mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

print("Will use: " + device)

# Move the loaded model onto the device chosen above; `model` is rebound to
# whatever `.to(device)` returns so later cells use the moved model.
model = model.to(device)

In [None]:
# set up text
# make sure to format the input text as shown below for optimal performance
# (this is the same format the model was given during fine-tuning)

# Raw pieces of the query: the target organization, a short title and the
# question itself.
query = {
    "organization": "경찰청",
    "title": "행정심판의 대상",
    "question": "행정심판의 대상은 무엇인가요?"
}

# Assemble the prompt: "<organization> title" on the first line, the question
# on the second, then the answer separator followed by a trailing newline.
# NOTE: the name `input` shadows the builtin, but it is kept because the
# generation cell reads the prompt from this variable.
input = (
    f"<{query['organization']}> {query['title']}\n"
    f"{query['question']}\n"
    "----답변----\n"
)
print(input)

In [None]:
# generate

# Tokenize the prompt once. `return_tensors="pt"` yields batched PyTorch
# tensors of shape (1, sequence_length), and `.to(device)` moves them next to
# the model, replacing the manual `torch.tensor([...])` wrapping done twice.
encoded = tokenizer(input, return_tensors="pt").to(device)

# Only `input_ids` and `attention_mask` are forwarded explicitly, so any other
# field the tokenizer may emit is not passed on to `generate`.
# Beam search with 2 beams, at most 256 newly generated tokens; `[0]` selects
# the single sequence of the batch.
generated_ids = model.generate(
    input_ids=encoded.input_ids,
    attention_mask=encoded.attention_mask,
    max_new_tokens=256,
    num_beams=2,
)[0]

# `generated_ids` is already a 1-D sequence of token ids, so it can be decoded
# directly. The decoded text is stored in `out` and printed.
out = tokenizer.decode(
    generated_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
print(out)