In [1]:
import sys, platform, torch
import transformers
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import accelerate
import os
import re
import logging
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""
pkill -u "$USER" -f python
if memory is insufficient
"""

'\npkill -u "$USER" -f python\nif memory is insufficient\n'

In [3]:
CACHE_DIR = "./models"          # download/cache here
INPUT_CSV = "test.csv"          # expects columns: ID, Question
OUTPUT_CSV = "beomi_polyglot_5.csv"   # submission format: ID, Answer

In [None]:
# model_id = "snunlp/KR-Medium"  # GPT2-like Korean LM
# model_id = "EleutherAI/polyglot-ko-5.8b"  # Mistral-based KULLM successor (wasn't very good)
model_id = "beomi/KoAlpaca-Polyglot-12.8B"

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models", padding_side="left")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,                 # or load_in_8bit=True
    device_map="auto",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards:  18%|█▊        | 5/28 [00:02<00:11,  2.00it/s]

In [None]:
def build_prompt(question):
    return f"""질문에 답하세요.
    - 우선 질문이 객관식인지 주관식인지 판단하세요
    - 객관식이면 숫자 하나를 답으로 쓰세요 (하나만 정답입니다).
    - 주관식이면 한 단락으로 질문의 모든 요소에 답하세요

    질문:
    {question}

    답:
    """

In [None]:
# load dataset and set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

test_df = pd.read_csv(INPUT_CSV)
assert {"ID", "Question"}.issubset(test_df.columns), "test.csv must have columns: ID, Question"


In [None]:
UNI_DIGIT_MAP = str.maketrans({
    "①":"1","②":"2","③":"3","④":"4","⑤":"5",
    "⑴":"1","⑵":"2","⑶":"3","⑷":"4","⑸":"5",
    "❶":"1","❷":"2","❸":"3","❹":"4","❺":"5",
})

def extract_answer_from_output(output_text: str) -> str:
    # after "답:"
    if "답:" in output_text:
        ans = output_text.split("답:", 1)[1].strip()
    else:
        ans = output_text.strip()

    # normalize circled digits
    ans = ans.translate(UNI_DIGIT_MAP).strip()

    # if starts with number 1-5 → return only that
    if ans and ans[0] in "12345":
        return ans[0]
    return ans

In [None]:
import math

batch_size = 8  # tune: 4~16 depending on VRAM
prompts = [build_prompt(str(q)) for q in test_df["Question"]]

answers = []
model.eval()

for i in tqdm(range(0, len(prompts), batch_size)):
    batch_prompts = prompts[i:i+batch_size]

    enc = tokenizer(
        batch_prompts,
        return_tensors="pt",
        padding=True,           # pad to longest in batch
        truncation=True,
        max_length=512
    )
    enc.pop("token_type_ids", None)
    enc = {k: v.to(model.device, non_blocking=True) for k, v in enc.items()}

    with torch.inference_mode():
        out = model.generate(
            **enc,
            max_new_tokens=200,         
            do_sample=True,
            no_repeat_ngram_size=3,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,             # (default True) keep it on
            num_beams=1                 # beam search is slower
        )

    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)

    answers.extend(decoded)

100%|██████████| 65/65 [16:31<00:00, 15.25s/it]


In [None]:
answers = [extract_answer_from_output(text) for text in answers]

In [None]:
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Answer": answers
})

submission.to_csv(OUTPUT_CSV, index=False)