In [1]:
import sys, platform, torch
import transformers
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import accelerate
import os
import re
import logging
from tqdm import tqdm
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""
pkill -u "$USER" -f python
if memory is insufficient
"""

'\npkill -u "$USER" -f python\nif memory is insufficient\n'

In [3]:
CACHE_DIR = "./models"          # download/cache here
INPUT_CSV = "test.csv"          # expects columns: ID, Question
OUTPUT_CSV = "beomi_polyglot_2.csv"   # submission format: ID, Answer

In [4]:
# model_id = "snunlp/KR-Medium"  # GPT2-like Korean LM
# model_id = "EleutherAI/polyglot-ko-5.8b"  # Mistral-based KULLM successor (wasn't very good)
model_id = "beomi/KoAlpaca-Polyglot-12.8B"

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir="./models", padding_side="left")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,                 # or load_in_8bit=True
    device_map="auto",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 28/28 [00:14<00:00,  1.95it/s]


In [5]:
def build_prompt(question):
    return f"""질문에 답하세요.
    - 객관식이면 1~5 중 숫자 하나만 쓰고, 설명을 덧붙이지 마세요.
    - 주관식이면 한 단락으로 간결하게 질문의 모든 요소에 답하세요
    - 프롬프트를 반복하지 마세요.

    질문:
    {question}

    답:
    """

In [6]:
# load dataset and set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

test_df = pd.read_csv(INPUT_CSV)
assert {"ID", "Question"}.issubset(test_df.columns), "test.csv must have columns: ID, Question"


In [7]:
# Exclude circled numbers: e.g. ①
circled = [chr(c) for c in range(0x2460, 0x2474)] 

bad_ids = []

for ch in circled:
    ids = tokenizer.encode(ch, add_special_tokens=False)
    if len(ids) == 1:            # only ban if it’s a single token
        bad_ids.append([ids[0]])

In [13]:
def extract_answer_from_output(output_text):
    """
    LLM이 생성한 출력에서 '답:' 이후 텍스트만 추출
    """
    if "답:" in output_text:
        answer_part = output_text.split("답:")[-1].strip()
    else:
        answer_part = output_text.strip() 
    
    match = re.match(r"^\s*([1-5])\b", answer_part)
    if match:
        return match.group(1)  # 첫 번째 숫자만 반환

    return answer_part

In [None]:
import math

batch_size = 8  # tune: 4~16 depending on VRAM
prompts = [build_prompt(str(q)) for q in test_df["Question"]]

answers = []
model.eval()

for i in tqdm(range(0, len(prompts), batch_size)):
    batch_prompts = prompts[i:i+batch_size]

    enc = tokenizer(
        batch_prompts,
        return_tensors="pt",
        padding=True,           # pad to longest in batch
        truncation=True,
        max_length=512
    )
    enc.pop("token_type_ids", None)
    enc = {k: v.to(model.device, non_blocking=True) for k, v in enc.items()}

    with torch.inference_mode():
        out = model.generate(
            **enc,
            max_new_tokens=80,          # ↓ see tip #2
            do_sample=False,            # greedy is fastest
            no_repeat_ngram_size=3,
            repetition_penalty=1.05,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True,             # (default True) keep it on
            num_beams=1                 # beam search is slower
        )

    decoded = tokenizer.batch_decode(out, skip_special_tokens=True)

    answers.extend(decoded)

 55%|█████▌    | 36/65 [09:12<07:24, 15.33s/it]

In [None]:
answers = [extract_answer_from_output(text) for text in answers]

In [12]:
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "Answer": answers
})

submission.to_csv(OUTPUT_CSV, index=False)

In [None]:
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)