In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [2]:
!pip install -y torch torchvision torchaudio bitsandbytes


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -y


In [3]:
!pip install --upgrade huggingface_hub



# run

In [4]:
!pip -q install --upgrade fastapi uvicorn nest_asyncio transformers huggingface_hub pycloudflared pyngrok

In [5]:
import os, re
import time
import torch
import threading, socket, contextlib
from dataclasses import dataclass, field
from typing import Optional, Dict, Tuple

import nest_asyncio
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from huggingface_hub import HfFolder


def load_hf_token_from_colab_secret(name="HF_TOKEN"):
    try:
        from google.colab import userdata
        tok = userdata.get(name)
        if tok and tok.strip():
            return tok.strip()
    except Exception:
        pass

    tok = os.getenv(name, "").strip()
    return tok or None

HF_TOKEN = load_hf_token_from_colab_secret("HF_TOKEN")
print("HF_TOKEN loaded:", "YES" if HF_TOKEN else "NO")

# gpu check
def _fmt_mb(b: int) -> str:
    return f"{b/(1024**2):.2f} MB"

def _gpu_stats(tag: str) -> str:
    if not torch.cuda.is_available():
        return f"[GPU] {tag}: CUDA 미사용"
    torch.cuda.synchronize()
    a = torch.cuda.memory_allocated()
    r = torch.cuda.memory_reserved()
    p = torch.cuda.max_memory_allocated()
    return f"[GPU] {tag} | 현재 할당: {_fmt_mb(a)} | 현재 예약: {_fmt_mb(r)} | 피크 할당: {_fmt_mb(p)}"

def _select_dtype_and_device_map():
    if torch.cuda.is_available():
        return torch.float16, "auto"
    elif torch.backends.mps.is_available():
        return torch.float16, {"": "mps"}
    return torch.float32, {"": "cpu"}

# model load & generate
@dataclass
class ModelConfig:
    repo_or_path: str = "kkuriyoon/QLoRA-ax4-StoryTeller"
    hf_token: Optional[str] = HF_TOKEN
    trust_remote_code: bool = True
    low_cpu_mem_usage: bool = False
    use_safetensors: bool = True

@dataclass
class GenConfig:
    max_new_tokens: int = 280
    temperature: float = 0.8
    top_p: float = 0.9
    do_sample: bool = True
    repetition_penalty: float = 1.05
    def to_hf(self) -> GenerationConfig:
        return GenerationConfig(
            max_new_tokens=self.max_new_tokens,
            temperature=self.temperature,
            top_p=self.top_p,
            do_sample=self.do_sample,
            repetition_penalty=self.repetition_penalty,
        )

@dataclass
class PromptConfig:
    length_hint: str = "6~8문장"
    reading_level: str = "해당 나이 또래가 술술 읽을 수 있는 난이도"
    safety: str = "폭력/공포/혐오/연령불가 요소 금지"
    style_override: Optional[str] = None
    genre_guides: Dict[str, str] = field(default_factory=lambda: {
        "동화": "따뜻하고 포근한 톤, 일상적 갈등과 작은 해결, 의성어/의태어 소량",
        "모험": "경쾌한 진행, 목표-장애-성장의 3막, 공간 이동과 작은 퀘스트",
        "미스터리": "부드러운 호기심 유발, 위험 최소화, 단서-추론-해결의 흐름",
        "판타지": "상상력 가득한 세계관, 마법/상징을 은유적으로 사용",
        "SF": "미래/과학 요소를 쉽고 안전하게 설명, 기술은 친근한 도구처럼",
        "일상": "친구/가족/학교/동네 등 공감 포인트 중심의 소소한 사건",
        "동시": "리듬/반복/이미지를 살린 운율, 짧은 행과 명료한 메시지",
    })

# main
class StoryTeller:
    def __init__(self, model_cfg: ModelConfig, gen_cfg: Optional[GenConfig] = None, prompt_cfg: Optional[PromptConfig] = None):
        self.model_cfg = model_cfg
        self.gen_cfg = gen_cfg or GenConfig()
        self.prompt_cfg = prompt_cfg or PromptConfig()
        self.tokenizer: Optional[AutoTokenizer] = None
        self.model: Optional[AutoModelForCausalLM] = None
        self.load_sec: Optional[float] = None

    def load(self) -> Tuple[AutoTokenizer, AutoModelForCausalLM, float]:
        print("Loading merged model from HF Hub / local path...")
        t0 = time.time()

        torch_dtype, device_map = _select_dtype_and_device_map()

        # repo 구분
        token_kw = {"token": self.model_cfg.hf_token} if self.model_cfg.hf_token else {}

        tok = AutoTokenizer.from_pretrained(
            self.model_cfg.repo_or_path,
            use_fast=True,
            trust_remote_code=self.model_cfg.trust_remote_code,
            **token_kw
        )
        if tok.pad_token_id is None and tok.eos_token_id is not None:
            tok.pad_token = tok.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            self.model_cfg.repo_or_path,
            torch_dtype=torch_dtype,
            device_map=device_map,
            trust_remote_code=self.model_cfg.trust_remote_code,
            low_cpu_mem_usage=self.model_cfg.low_cpu_mem_usage,
            use_safetensors=True,
            **token_kw
        )
        model.config.use_cache = True

        if torch.cuda.is_available():
            torch.backends.cuda.matmul.allow_tf32 = True
            torch.backends.cudnn.allow_tf32 = True

        self.tokenizer = tok
        self.model = model
        self.load_sec = time.time() - t0

        print(f"Loaded in {self.load_sec:.2f}s")
        print(_gpu_stats("로드 직후"))
        return tok, model, self.load_sec

    def build_prompt(self, name: str, age: int, genre: str) -> str:
        pc = self.prompt_cfg
        guide = pc.genre_guides.get(genre, "장르적 관습을 유아 친화적으로 순화하여 반영")
        if pc.style_override:
            guide = pc.style_override

        return f"""당신은 아동 문학 작가이자 언어발달 코치입니다. 아래 정보를 반영하여 {age}살 아이 '{name}'에게 딱 맞는 {genre} 장르 이야기(한국어)를 작성하세요.

[독자 정보]
- 이름: {name}
- 나이: {age}세
- 읽기 난이도: {pc.reading_level}

[장르 가이드]
- {guide}

[스토리 구성(제로샷 지시)]
1) 첫 문장은 자연스럽게 상황/장면/감정 중 하나로 시작합니다. 특정 고정 문구로 시작하지 마세요.
2) {name}이(가) 겪는 작은 어려움 → 시도/도움 → 스스로(또는 친구와 함께) 해결.
3) 장면 전환은 과도하지 않게 2~3회 이내로.
4) 대사는 2~4곳에만 자연스럽게 섞되, 과도한 감탄사/반복은 지양.
5) 비유/상징은 나이에 맞게 쉬운 단어로.

[길이]
- 문장 수: {pc.length_hint}

[안전/윤리]
- {pc.safety}
- 실제 인물·브랜드·정치적 사안 언급 금지.
- 표절 금지, 전개는 새롭게 구성.

[출력 형식(중요)]
- 이야기 본문
- 공백 한 줄
- 제목: (이야기에 어울리는 짧고 인상적인 책 제목)

이제 '{genre}' 장르의 이야기를 작성하세요."""

    @torch.inference_mode()
    def generate(self, prompt: str) -> Dict[str, object]:
        assert self.tokenizer is not None and self.model is not None, "먼저 load()를 호출해 모델을 로드하세요."
        tok = self.tokenizer
        model = self.model

        inputs = tok(prompt, return_tensors="pt", truncation=True, max_length=1024)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()

        # warmup
        _ = model.generate(**{k: v[:, :4] for k, v in inputs.items()}, max_new_tokens=1, do_sample=False)

        cfg = self.gen_cfg.to_hf()
        t0 = time.time()
        out = model.generate(
            **inputs,
            max_new_tokens=cfg.max_new_tokens,
            temperature=cfg.temperature,
            top_p=cfg.top_p,
            do_sample=cfg.do_sample,
            repetition_penalty=cfg.repetition_penalty,
            eos_token_id=tok.eos_token_id,
            pad_token_id=tok.pad_token_id,
            use_cache=True,
        )
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        gen_sec = time.time() - t0

        text = tok.decode(out[0], skip_special_tokens=True)
        if text.startswith(prompt):
            text = text[len(prompt):].lstrip()

        new_tokens = out[0].shape[0] - inputs["input_ids"].shape[1]
        tokps = new_tokens / max(gen_sec, 1e-6)

        gpu_summary = None
        if torch.cuda.is_available():
            gpu_summary = {
                "alloc_now": _fmt_mb(torch.cuda.memory_allocated()),
                "reserved_now": _fmt_mb(torch.cuda.memory_reserved()),
                "peak_alloc": _fmt_mb(torch.cuda.max_memory_allocated()),
            }

        return {
            "text": text,
            "gen_sec": gen_sec,
            "new_tokens": new_tokens,
            "tokps": tokps,
            "gpu": gpu_summary,
        }

# title, content
def _split_title_content(text: str) -> Tuple[str, str]:
    s = text.strip()

    # title 추출
    titles = re.findall(r"제목\s*:\s*(.+)", s)
    title = titles[0].strip() if titles else "제목 없음"

    # title 후처리
    title = re.sub(r"\*\*", "", title)
    title = title.replace("\\", " ")
    title = title.replace("\n", " ")
    title = re.sub(r"\s+", " ", title).strip()

    # content 후처리
    content = re.sub(r"제목\s*:\s*.+", "", s)
    content = re.sub(r"-{3,}", " ", content)
    content = re.sub(r"#", " ", content)
    content = re.sub(r"\*\*", "", content)
    content = content.replace("\\", "")
    content = content.replace("\n", " ")
    content = re.sub(r"\s+", " ", content).strip()

    return title, content


def _split_sentences_kor(s: str):
    s = re.sub(r"\s+", " ", s.strip())
    # '다.', '요.' 또는 일반 문장 부호로 끝나는 구간을 문장으로 간주
    pat = re.compile(r'.*?(?:다\.|요\.|[.!?…])')
    sentences = pat.findall(s)
    tail = s[sum(len(x) for x in sentences):].strip()
    if tail:
        sentences.append(tail)
    # 공백/잡문 제거
    sentences = [x.strip() for x in sentences if x.strip()]
    return sentences



def _paginate_content(content: str, sentences_per_page: int = 3, max_chars: int = 600):
    sents = _split_sentences_kor(content)
    pages = []
    buf = []
    for i, sent in enumerate(sents, 1):
        buf.append(sent)
        if (i % sentences_per_page) == 0:
            page = " ".join(buf).strip()
            pages.append(page)
            buf = []
    if buf:
        pages.append(" ".join(buf).strip())

    fixed = []
    for p in pages:
        if len(p) <= max_chars:
            fixed.append(p)
        else:
            chunk = []
            cur = 0
            words = p.split(" ")
            for w in words:
                if cur + len(w) + (1 if chunk else 0) > max_chars:
                    fixed.append(" ".join(chunk))
                    chunk = [w]
                    cur = len(w)
                else:
                    if chunk:
                        chunk.append(w); cur += len(w) + 1
                    else:
                        chunk = [w]; cur = len(w)
            if chunk:
                fixed.append(" ".join(chunk))
    return fixed


# fastapi
app = FastAPI(title="StoryTeller API", version="1.0.0")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

STORY: Optional[StoryTeller] = None

class GenerateRequest(BaseModel):
    name: str
    age: int
    genre: str
    max_new_tokens: Optional[int] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    repetition_penalty: Optional[float] = None
    style_override: Optional[str] = None
    length_hint: Optional[str] = None

# title/content만 반환하도록 축소
class GenerateResponse(BaseModel):
    title: str
    content: str
    pages: list[str]
    page_count: int
    contents: str

@app.on_event("startup")
def _startup():
    global STORY
    model_cfg = ModelConfig()
    gen_cfg = GenConfig()
    prompt_cfg = PromptConfig()
    STORY = StoryTeller(model_cfg, gen_cfg, prompt_cfg)
    STORY.load()

@app.get("/health")
def health():
    return {
        "status": "ok",
        "model_loaded": STORY is not None and STORY.model is not None,
        "gpu": _gpu_stats("헬스체크"),
    }

@app.post("/generate", response_model=GenerateResponse)
def generate(req: GenerateRequest):
    assert STORY is not None, "Server not initialized"
    # 동적 파라미터 반영
    if req.max_new_tokens is not None: STORY.gen_cfg.max_new_tokens = req.max_new_tokens
    if req.temperature is not None: STORY.gen_cfg.temperature = req.temperature
    if req.top_p is not None: STORY.gen_cfg.top_p = req.top_p
    if req.repetition_penalty is not None: STORY.gen_cfg.repetition_penalty = req.repetition_penalty
    if req.style_override is not None: STORY.prompt_cfg.style_override = req.style_override
    if req.length_hint is not None: STORY.prompt_cfg.length_hint = req.length_hint

    prompt = STORY.build_prompt(name=req.name, age=req.age, genre=req.genre)
    result = STORY.generate(prompt)

    # 생성 텍스트에서 제목/내용만 추출
    title, content = _split_title_content(result["text"])
    pages = _paginate_content(content, sentences_per_page=3, max_chars=600)

    # 페이지별 \n 구분
    contents = "\n\n--- Page Break ---\n\n".join(pages)

    return GenerateResponse(
        title=title,
        content=content,
        pages=pages,
        page_count=len(pages),
        contents=contents
    )

# run
PORT = int(os.environ.get("PORT", 8001))

def is_port_open(port: int) -> bool:
    with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.settimeout(0.2)
        return s.connect_ex(("127.0.0.1", port)) == 0

_server_thread = None
def run_uvicorn_background(app, host="0.0.0.0", port=PORT, log_level="info"):
    global _server_thread
    if _server_thread and _server_thread.is_alive():
        return
    def _target():
        nest_asyncio.apply()
        uvicorn.run(app, host=host, port=port, log_level=log_level)
    _server_thread = threading.Thread(target=_target, daemon=True)
    _server_thread.start()
    for _ in range(80):
        if is_port_open(port):
            print(f"[OK] Uvicorn running on http://127.0.0.1:{port}")
            return
        time.sleep(0.25)
    raise RuntimeError("Uvicorn not started in time")

run_uvicorn_background(app)

# Cloudflare
try:
    from pycloudflared import try_cloudflare
    public_url_cf = try_cloudflare(port=PORT)
    print("🌐 Cloudflare URL:", public_url_cf)
    print("  - GET :", f"{public_url_cf}/health")
    print("  - POST:", f"{public_url_cf}/generate")
except Exception as e:
    print("[warn] Cloudflare tunnel skipped:", repr(e))

# ngrok
USE_NGROK = False
if USE_NGROK:
    try:
        from pyngrok import ngrok
        REAL_NGROK_TOKEN = os.getenv("NGROK_TOKEN", "")
        if REAL_NGROK_TOKEN:
            ngrok.set_auth_token(REAL_NGROK_TOKEN)
            public_url_ng = ngrok.connect(addr=PORT, proto="http")
            print(" ngrok URL    :", public_url_ng.public_url)
        else:
            print("[warn] NGROK_TOKEN 비어있음, ngrok 생략")
    except Exception as e:
        print("[warn] ngrok skipped:", repr(e))

        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")
INFO:     Started server process [51079]
INFO:     Waiting for application startup.


HF_TOKEN loaded: YES
Loading merged model from HF Hub / local path...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)


Loaded in 10.24s
[GPU] 로드 직후 | 현재 할당: 13849.14 MB | 현재 예약: 13870.00 MB | 피크 할당: 13849.14 MB
[OK] Uvicorn running on http://127.0.0.1:8001
 * Running on https://medium-exp-dozens-resource.trycloudflare.com
 * Traffic stats available on http://127.0.0.1:20241/metrics
🌐 Cloudflare URL: Urls(tunnel='https://medium-exp-dozens-resource.trycloudflare.com', metrics='http://127.0.0.1:20241/metrics', process=<Popen: returncode: None args: ['/usr/local/lib/python3.11/dist-packages/pyc...>)
  - GET : Urls(tunnel='https://medium-exp-dozens-resource.trycloudflare.com', metrics='http://127.0.0.1:20241/metrics', process=<Popen: returncode: None args: ['/usr/local/lib/python3.11/dist-packages/pyc...>)/health
  - POST: Urls(tunnel='https://medium-exp-dozens-resource.trycloudflare.com', metrics='http://127.0.0.1:20241/metrics', process=<Popen: returncode: None args: ['/usr/local/lib/python3.11/dist-packages/pyc...>)/generate


In [None]:
import os, time, torch, re
from typing import Optional, List
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from huggingface_hub import login

REPO_ID = "kkuriyoon/QLoRA-ax4-StoryTeller"
my_hf_token = " "


def _fmt_mb(b): return f"{b/(1024**2):.2f} MB"
def _gpu_stats(tag):
    if not torch.cuda.is_available(): return f"[GPU] {tag}: CUDA 미사용"
    torch.cuda.synchronize()
    a, r, p = torch.cuda.memory_allocated(), torch.cuda.memory_reserved(), torch.cuda.max_memory_allocated()
    return f"[GPU] {tag} | 현재 할당: {_fmt_mb(a)} | 현재 예약: {_fmt_mb(r)} | 피크 할당: {_fmt_mb(p)}"

def _select_dtype_and_device_map():
    if torch.cuda.is_available(): return torch.float16, "auto"
    elif torch.backends.mps.is_available(): return torch.float16, {"": "mps"}
    return torch.float32, "auto"

# 모델 로드
def load_model(repo_id: str, token: Optional[str] = None):
    if token:
        try:
            login(token=token, add_to_git_credential=True)
        except Exception:
            pass

    print("Loading merged model from HF Hub...")
    t0 = time.time()

    def _try_load(token_arg):
        tok = AutoTokenizer.from_pretrained(
            repo_id,
            use_fast=True,
            trust_remote_code=True,
            **({"token": token_arg} if token_arg else {})
        )
        if tok.pad_token_id is None and tok.eos_token_id is not None:
            tok.pad_token = tok.eos_token

        torch_dtype, device_map = _select_dtype_and_device_map()
        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype=torch_dtype,
            device_map=device_map,
            trust_remote_code=True,
            low_cpu_mem_usage=False,
            use_safetensors=True,
            **({"token": token_arg} if token_arg else {})
        )
        return tok, model

    tok, model = _try_load(token)
    model.config.use_cache = True
    if torch.cuda.is_available():
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    load_sec = time.time() - t0
    print(f"Loaded in {load_sec:.2f}s")
    print(_gpu_stats("로드 직후"))
    return tok, model, load_sec

# prompt
def default_gen_cfg(short: bool = False) -> GenerationConfig:
    return GenerationConfig(
        max_new_tokens=40 if short else 280,
        temperature=0.8,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.05,
    )

def build_prompt(
    name: str,
    age: int,
    genre: str,
    length_hint: str = "8~12문장",
    reading_level: str = "해당 나이 또래가 술술 읽을 수 있는 난이도",
    safety: str = "폭력/공포/혐오/연령불가 요소 금지",
    style_override: Optional[str] = None,
):
    genre_guides = {
        "동화": "따뜻하고 포근한 톤, 일상적 갈등과 작은 해결, 의성어/의태어 소량",
        "모험": "경쾌한 진행, 목표-장애-성장의 3막, 공간 이동과 작은 퀘스트",
        "미스터리": "부드러운 호기심 유발, 위험 최소화, 단서-추론-해결의 흐름",
        "판타지": "상상력 가득한 세계관, 마법/상징을 은유적으로 사용",
        "SF": "미래/과학 요소를 쉽고 안전하게 설명, 기술은 친근한 도구처럼",
        "일상": "친구/가족/학교/동네 등 공감 포인트 중심의 소소한 사건",
        "동시": "리듬/반복/이미지를 살린 운율, 짧은 행과 명료한 메시지",
    }
    guide = genre_guides.get(genre, "장르적 관습을 유아 친화적으로 순화하여 반영")

    if style_override:
        guide = style_override

    return f"""당신은 아동 문학 작가이자 언어발달 코치입니다. 아래 정보를 반영하여 {age}살 아이 '{name}'에게 딱 맞는 {genre} 장르 이야기(한국어)를 작성하세요.

[독자 정보]
- 이름: {name}
- 나이: {age}세
- 읽기 난이도: {reading_level}

[장르 가이드]
- {guide}

[스토리 구성(제로샷 지시)]
1) 첫 문장은 자연스럽게 상황/장면/감정 중 하나로 시작합니다. 특정 고정 문구로 시작하지 마세요.
2) {name}이(가) 겪는 작은 어려움 → 시도/도움 → 스스로(또는 친구와 함께) 해결.
3) 장면 전환은 과도하지 않게 2~3회 이내로.
4) 대사는 2~4곳에만 자연스럽게 섞되, 과도한 감탄사/반복은 지양.
5) 비유/상징은 나이에 맞게 쉬운 단어로.

[길이]
- 문장 수: {length_hint}

[안전/윤리]
- {safety}
- 실제 인물·브랜드·정치적 사안 언급 금지.
- 표절 금지, 전개는 새롭게 구성.

[출력 형식(중요)]
- 이야기 본문
- 공백 한 줄
- 제목: (이야기에 어울리는 짧고 인상적인 책 제목)

이제 '{genre}' 장르의 이야기를 작성하세요."""

# generate
@torch.inference_mode()
def generate_story(tokenizer, model, prompt, gen_cfg: Optional[GenerationConfig] = None):
    gen_cfg = gen_cfg or default_gen_cfg(short=False)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()

    t0 = time.time()
    # 워밍업
    _ = model.generate(**{k: v[:, :4] for k, v in inputs.items()}, max_new_tokens=1, do_sample=False)

    out = model.generate(
        **inputs,
        max_new_tokens=gen_cfg.max_new_tokens,
        temperature=gen_cfg.temperature,
        top_p=gen_cfg.top_p,
        do_sample=gen_cfg.do_sample,
        repetition_penalty=gen_cfg.repetition_penalty,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        use_cache=True,
    )
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    gen_sec = time.time() - t0

    text = tokenizer.decode(out[0], skip_special_tokens=True)
    if text.startswith(prompt):
        text = text[len(prompt):].lstrip()

    new_tokens = out[0].shape[0] - inputs["input_ids"].shape[1]
    tokps = new_tokens / max(gen_sec, 1e-6)

    gpu_summary = None
    if torch.cuda.is_available():
        gpu_summary = {
            "alloc_now": _fmt_mb(torch.cuda.memory_allocated()),
            "reserved_now": _fmt_mb(torch.cuda.memory_reserved()),
            "peak_alloc": _fmt_mb(torch.cuda.max_memory_allocated()),
        }

    return text, gen_sec, new_tokens, tokps, gpu_summary

# run
if __name__ == "__main__":
    tokenizer, model, load_sec = load_model(REPO_ID, token=my_hf_token)

    prompt = build_prompt(name="나린", age=7, genre="판타지")

    story_full, gen_sec, new_tokens, tokps, gpu_summary = generate_story(
        tokenizer, model, prompt
    )

    print("\n 📍 동화 출력 \n")
    print(story_full)

    print("\n 📍 요약(SUMMARY)")
    print(f"- 모델 로드 시간: {load_sec:.2f} s")
    print(f"- 생성 시간: {gen_sec:.2f} s | 생성 토큰: {new_tokens} | 속도: {tokps:.2f} tok/s")
    if gpu_summary:
        print(f"- GPU 현재 할당: {gpu_summary['alloc_now']}")
        print(f"- GPU 현재 예약: {gpu_summary['reserved_now']}")
        print(f"- GPU 피크 할당(이번 생성 기준): {gpu_summary['peak_alloc']}")
    else:
        print("- GPU 미사용(CPU/MPS) 환경")

Loading merged model from HF Hub...


OSError: There was a specific connection error when trying to load kkuriyoon/QLoRA-ax4-StoryTeller:
401 Client Error: Unauthorized for url: https://huggingface.co/kkuriyoon/QLoRA-ax4-StoryTeller/resolve/main/config.json (Request ID: Root=1-68a1b641-32ab4df7665299880c3e2d7d;13872f0d-e684-4323-a928-a4c87e2689f0)

Invalid credentials in Authorization header

In [6]:
# 환경변수에 토큰이 잡혀 있으면 비우기
import os
for k in ["HF_TOKEN", "HUGGINGFACE_HUB_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "HF_HOME"]:
    if k in os.environ: os.environ.pop(k)

# huggingface_hub 캐시된 토큰 제거
from huggingface_hub import HfFolder
try:
    HfFolder.delete_token()
    print("HF cached token removed")
except Exception as e:
    print("skip:", e)

# netrc가 있으면 requests가 자동 로그인 → 이름 변경
import os, pathlib, shutil
p = pathlib.Path.home()/".netrc"
if p.exists():
    shutil.move(str(p), str(p)+".bak")
    print("~/.netrc -> ~/.netrc.bak")

# (D) git-credentials에 huggingface.co 줄이 있으면 삭제 권장
!if [ -f ~/.git-credentials ]; then sed -n '1,200p' ~/.git-credentials; fi

HF cached token removed
