In [1]:
!pip -q install "transformers>=4.43.3" "accelerate>=0.33.0" \
                "bitsandbytes>=0.43.1" "gradio>=4.36.0" \
                "sentencepiece" "einops"


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch, re, unicodedata, random
from threading import Thread
from typing import List, Tuple
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Hugging Face Hub repository id of the checkpoint; used as the default
# `model_id` of load_model() and shown in the startup message / UI title.
MODEL_ID = "buttercoconut/Ko-Llama-3.2-1B-instruct"


def load_model(model_id=MODEL_ID):
    """Load the tokenizer and causal LM for ``model_id``.

    When CUDA is available the model is loaded 4-bit quantized (NF4, double
    quantization, bfloat16 compute) with ``device_map="auto"`` to save VRAM.
    Otherwise it is loaded on CPU in float32.

    Args:
        model_id: Hub repository id or local path passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer_obj = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    # CPU-only path: plain float32 weights, no quantization.
    if not torch.cuda.is_available():
        cpu_kwargs = {"device_map": "cpu", "torch_dtype": torch.float32}
        return tokenizer_obj, AutoModelForCausalLM.from_pretrained(model_id, **cpu_kwargs)

    # GPU path: 4-bit quantization to reduce VRAM usage.
    from transformers import BitsAndBytesConfig

    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    gpu_kwargs = {
        "quantization_config": quant_cfg,
        "device_map": "auto",
        "torch_dtype": torch.bfloat16,
    }
    return tokenizer_obj, AutoModelForCausalLM.from_pretrained(model_id, **gpu_kwargs)

# Load once at module level; later cells read `tokenizer`, `model` and `DEVICE`
# as globals.
tokenizer, model = load_model()
# Device reported by the loaded model; input tensors are moved here before generation.
DEVICE = model.device
print(f"Loaded: {MODEL_ID} on {DEVICE}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

Loaded: buttercoconut/Ko-Llama-3.2-1B-instruct on cpu


In [3]:
def normalize_ko(s: str) -> str:
    """Clean up model output for display.

    Steps:
      1. Unicode NFC normalization (composes decomposed Hangul jamo).
      2. Every character outside the whitelist (Hangul, ASCII letters/digits,
         newline, space and common punctuation) is replaced by a space.
      3. Spacing around ``, . ; : ! ?`` is tidied: spaces before the mark are
         dropped, and one space is inserted after it when text follows directly.
      4. Runs of spaces are collapsed, spaces around newlines are trimmed, and
         the result is stripped.

    Unlike a blanket "punctuation + space" rewrite, step 3 keeps newlines,
    leaves numbers such as ``3.14``, ``1,000`` and ``10:30`` intact, does not
    break up ``...`` or ``?!``, and does not push a space in front of closing
    quotes or brackets.

    Args:
        s: Raw text (may be a partial, still-streaming reply).

    Returns:
        The normalized text.
    """
    s = unicodedata.normalize("NFC", s)
    # Whitelist filter: anything not listed here becomes a space (tabs included).
    s = re.sub(r"[^\n가-힣ㄱ-ㅎㅏ-ㅣ0-9A-Za-z ,\.\-–—~:;\'\"!?()\[\]“”‘’·…%@&/]", " ", s)
    # Remove horizontal whitespace in front of punctuation ("안녕 ," -> "안녕,").
    # Only spaces/tabs are matched so line breaks survive.
    s = re.sub(r"[ \t]+(?=[,.;:!?])", "", s)
    # Add a space after punctuation only when a word follows immediately.
    # Digits, whitespace, further punctuation, '/', '~' and closing
    # quotes/brackets are excluded so decimals, times, ellipses and quoted
    # sentences stay intact.
    s = re.sub(r"([,.;:!?])(?=[^\s\d,.;:!?…~/\"'”’)\]])", r"\1 ", s)
    s = re.sub(r"[ \t]+", " ", s)
    # Drop the single spaces that may remain next to a line break.
    s = re.sub(r" ?\n ?", "\n", s)
    return s.strip()

# Default system prompt (Korean). Roughly: "You are a capable conversational
# assistant fluent in Korean. Answer accurately and kindly."
# NOTE(review): the second sentence contains a double space after "친절하게";
# it looks unintentional - confirm before changing, since this text is sent to
# the model and shown in the UI.
SYSTEM_PROMPT_DEFAULT = (
    "당신은 한국어를 능숙하게 구사하는 유능한 대화형 비서입니다. "
    "정확하고 친절하게  답하세요."
)

def build_messages(user_message: str, history: List, system_prompt: str):
    """Build the chat-template message list for one model call.

    Args:
        user_message: The new user turn; always appended last.
        history: Previous turns, or ``None``. Two layouts are accepted:
            * pair style - each item is a ``(user, assistant)`` pair; empty
              halves are skipped;
            * messages style - each item is a dict with ``"role"`` and
              ``"content"`` keys (extra keys are ignored). Only ``user`` /
              ``assistant`` items with non-empty string content are kept.
        system_prompt: System prompt; falls back to ``SYSTEM_PROMPT_DEFAULT``
            when empty.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: system message,
        prior turns in order, then the new user message.
    """
    msgs = [{"role": "system", "content": system_prompt or SYSTEM_PROMPT_DEFAULT}]
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-style item. Unpacking a dict as a pair would yield its
            # key names, so it must be handled explicitly.
            role, content = turn.get("role"), turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                msgs.append({"role": role, "content": content})
            continue
        # Pair-style item: (user_text, assistant_text).
        u, a = turn
        if u:
            msgs.append({"role": "user", "content": u})
        if a:
            msgs.append({"role": "assistant", "content": a})
    msgs.append({"role": "user", "content": user_message})
    return msgs


In [4]:
@torch.inference_mode()
def generate_stream(messages, temperature=0.7, top_p=0.9, max_new_tokens=512,
                    repetition_penalty=1.05, no_repeat_ngram_size=3):
    """Stream a chat completion, yielding the normalized reply accumulated so far.

    Generation runs on a worker thread and feeds a ``TextIteratorStreamer``;
    this generator consumes the streamer and yields ``normalize_ko`` of the
    text received up to that point (the full reply so far, not a delta).

    Args:
        messages: Chat messages (``{"role", "content"}`` dicts) for the
            tokenizer's chat template.
        temperature: Sampling temperature. Values <= 0 switch to greedy decoding.
        top_p: Nucleus sampling threshold (used only when sampling).
        max_new_tokens: Upper bound on generated tokens.
        repetition_penalty: Passed through to ``model.generate``.
        no_repeat_ngram_size: Passed through to ``model.generate``.

    Yields:
        The normalized partial reply after each streamed chunk.

    Raises:
        Exception: Any error raised by ``model.generate`` on the worker thread
            is re-raised here once the stream has ended.
    """
    # NOTE(review): inference_mode is thread-local, so the decorator does not
    # cover the worker thread; model.generate applies its own no-grad handling.

    # Chat template -> input tensors. return_dict=True also yields the
    # attention mask, which generate() needs because pad and EOS share an id.
    encoded = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
    ).to(DEVICE)

    # skip_prompt=True: without it the streamer echoes the prompt (system
    # prompt, history and user turn) at the start of the reply.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    gen_kwargs = dict(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        repetition_penalty=float(repetition_penalty),
        no_repeat_ngram_size=int(no_repeat_ngram_size),
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    temperature = float(temperature)
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=float(top_p))
    else:
        # Sampling with a non-positive temperature is rejected by generate();
        # fall back to greedy decoding instead.
        gen_kwargs.update(do_sample=False)

    worker_errors = []

    def _run_generation():
        """Run generate(); on failure, record the error and close the stream."""
        try:
            model.generate(**gen_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # Without this the consumer loop below would wait forever.
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield normalize_ko(partial_text)

    # The stream has ended, so the worker is finished (or about to be).
    thread.join()
    if worker_errors:
        raise worker_errors[0]


In [8]:
# 4) Gradio ChatInterface (revised: unnecessary arguments removed, additional_inputs used)
def chat_fn(message, history, temperature, top_p, max_new_tokens, system_prompt):
    """Chat callback: build the prompt messages and stream the model's reply.

    Args:
        message: The new user message.
        history: Previous turns as supplied by the chat UI.
        temperature, top_p, max_new_tokens: Generation settings from the UI.
        system_prompt: System prompt text from the UI.

    Yields:
        Each partial reply produced by ``generate_stream``.
    """
    conversation = build_messages(message, history, system_prompt)
    # Delegate streaming straight to the generator.
    yield from generate_stream(
        conversation,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
    )

# Build the UI: a header, a row with the system-prompt box and the generation
# sliders, and a ChatInterface wired to chat_fn.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("##  한국어 LLM 챗봇 ")
    with gr.Row():
        # Left column: editable system prompt, pre-filled with the default.
        with gr.Column(scale=3):
            sys_box = gr.Textbox(
                value=SYSTEM_PROMPT_DEFAULT, label="System Prompt", lines=3
            )
        # Right column: sampling / length controls.
        with gr.Column(scale=2):
            temperature = gr.Slider(0.1, 1.3, value=0.7, step=0.05, label="Temperature")
            top_p       = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
            max_tokens  = gr.Slider(64, 1024, value=512, step=16, label="Max New Tokens")

    # The order of additional_inputs must match chat_fn's extra parameters:
    # (temperature, top_p, max_new_tokens, system_prompt).
    chat = gr.ChatInterface(
        fn=chat_fn,
        title=f"LLM Chatbot · {MODEL_ID}",
        additional_inputs=[temperature, top_p, max_tokens, sys_box],  # pass the slider / prompt values to chat_fn
    )

demo.launch(share=True)  # set share=False if you do not want a public share URL


  self.chatbot = Chatbot(


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d509c574e9a4b572d8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


