In [None]:
!pip install transformers gradio sentencepiece


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import gradio as gr
import re

# 1. Load a better Korean model (using KLUE-BERT).
# `tokenizer` and `model` are module-level objects that the functions below
# read directly instead of receiving them as parameters.
MODEL_NAME = "klue/bert-base"  # or "skt/kobert-base-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# 2. Text preprocessing function
def preprocess_text(text):
    """Normalize raw text before it is tokenized.

    Characters outside the allowed set are removed first, then every run of
    whitespace is collapsed to a single space and the result is trimmed.
    The allowed set is: word characters, whitespace, the extra characters
    listed in the pattern, and the punctuation ``. , ! ?``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, with no leading/trailing whitespace and no
        consecutive whitespace characters.
    """
    # Strip disallowed characters BEFORE collapsing whitespace: deleting a
    # symbol that sits between two spaces ("a @ b") leaves a double space
    # behind, which the whitespace pass below then folds into one.
    # NOTE(review): the non-ASCII characters in this class look like a
    # mis-encoded Hangul syllable range; confirm the file's encoding.
    text = re.sub(r'[^\w\sÍ∞Ä-Ìû£.,!?]', '', text)
    # Collapse any whitespace run (spaces, tabs, newlines) to one space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# 3. Improved chunking function
def chunk_text_korean(text, max_tokens=50, overlap=10):
    """Split Korean text into overlapping chunks of bounded token length.

    The text is cleaned with ``preprocess_text``, encoded with the
    module-level ``tokenizer``, cut into sliding windows of token ids, and
    each window is decoded back into a string.

    Args:
        text: Raw input text.
        max_tokens: Maximum number of tokens per chunk, not counting special
            tokens. Must be at least 1.
        overlap: Number of tokens shared by two consecutive chunks. Must
            satisfy ``0 <= overlap < max_tokens``.

    Returns:
        A list of decoded chunk strings, in order. The list is empty when
        the cleaned text produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. A zero
            or negative window step would otherwise make ``range`` fail or
            yield nothing, and a negative overlap would skip tokens.
    """
    # UI widgets may deliver floats; range() and slicing need integers.
    max_tokens = int(max_tokens)
    overlap = int(overlap)
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be at least 1, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            "overlap must satisfy 0 <= overlap < max_tokens, "
            f"got overlap={overlap}, max_tokens={max_tokens}"
        )

    text = preprocess_text(text)

    # Encode without special tokens so that no [CLS]/[SEP] ids have to be
    # stripped from the ends afterwards.
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=False)

    # Slide a window of max_tokens ids, advancing by the non-overlapping part.
    step = max_tokens - overlap
    windows = []
    for start in range(0, len(tokens), step):
        windows.append(tokens[start:start + max_tokens])
        # Once a window reaches the end of the sequence, any further window
        # would contain only tokens already present in this one.
        if start + max_tokens >= len(tokens):
            break

    decoded_chunks = []
    for window in windows:
        decoded = tokenizer.decode(
            window,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        # Decoding may leave irregular spacing; normalize it.
        decoded_chunks.append(re.sub(r'\s+', ' ', decoded).strip())

    return decoded_chunks

# 4. Embedding generation function (improved)
def get_embedding(chunk_tokens):
    """Compute an embedding for one chunk of token ids.

    Args:
        chunk_tokens: List of token ids for the chunk, without special
            tokens; the tokenizer's CLS and SEP ids are added here.

    Returns:
        A NumPy array taken from position 0 (the [CLS] slot) of the model's
        last hidden state, with one row for the single input sequence.
    """
    # Frame the chunk with the special tokens the model expects.
    framed_ids = [tokenizer.cls_token_id] + chunk_tokens + [tokenizer.sep_token_id]
    batch = torch.tensor([framed_ids])  # a batch holding one sequence
    mask = torch.ones_like(batch)  # no padding, so every position is attended

    # Inference only: gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(input_ids=batch, attention_mask=mask).last_hidden_state

    # Use the vector at position 0; averaging over dim=1 (mean pooling)
    # would be the alternative.
    cls_vector = hidden_states[:, 0]
    return cls_vector.numpy()

# 5. Token analysis function (added)
def analyze_tokens(text):
    """Describe every token of ``text`` and flag unknown (UNK) tokens.

    The text is encoded without special tokens. Each token id becomes one
    display string: the tokenizer's UNK id is rendered as ``[UNK]``, any
    other id as its decoded text in single quotes; both are followed by the
    numeric id.

    Args:
        text: The text to tokenize.

    Returns:
        A list of formatted strings, one per token id, in token order.
    """
    unk_id = tokenizer.unk_token_id
    return [
        f" [UNK] (ID: {token_id})"
        if token_id == unk_id
        else f" '{tokenizer.decode([token_id])}' (ID: {token_id})"
        for token_id in tokenizer.encode(text, add_special_tokens=False)
    ]

# 6. Gradio interface function (improved)
def process_text(text, max_tokens, overlap):
    """Build the text report shown in the UI for one input.

    The report has two parts: a token summary (total token count and number
    of UNK tokens, from ``analyze_tokens``) and one entry per chunk produced
    by ``chunk_text_korean``. Each chunk entry shows the chunk text, its
    token count after re-encoding, and the shape of its embedding.

    Args:
        text: The user-supplied text.
        max_tokens: Maximum tokens per chunk, forwarded to the chunker.
        overlap: Token overlap between chunks, forwarded to the chunker.

    Returns:
        The report as a single newline-joined string, or a short prompt
        message when the input is blank.
    """
    # Blank input: nothing to analyze.
    if text.strip() == "":
        return "ÌÖçÏä§Ìä∏Î•º ÏûÖÎ†•Ìï¥Ï£ºÏÑ∏Ïöî."

    # Token summary is computed on the raw input text.
    token_report = analyze_tokens(text)
    unk_total = sum("[UNK]" in entry for entry in token_report)

    pieces = chunk_text_korean(text, max_tokens, overlap)

    # Header: token statistics.
    report = [
        " **ÌÜ†ÌÅ∞ Î∂ÑÏÑù**",
        f"- Ï¥ù ÌÜ†ÌÅ∞ Ïàò: {len(token_report)}",
        f"- UNK ÌÜ†ÌÅ∞ Ïàò: {unk_total}",
    ]
    if unk_total > 0:
        report.append(f" UNK ÌÜ†ÌÅ∞Ïù¥ {unk_total}Í∞ú Î∞úÍ≤¨ÎêòÏóàÏäµÎãàÎã§.")
    report.append("")

    # Section title for the chunk list.
    report.append(f" **Ï≤≠ÌÇπ Í≤∞Í≥º** (Ï¥ù {len(pieces)}Í∞ú)")
    report.append("")

    for number, piece in enumerate(pieces, start=1):
        # Chunks arrive as text, so they are re-encoded to get token ids.
        piece_ids = tokenizer.encode(piece, add_special_tokens=False)
        try:
            vector = get_embedding(piece_ids)
            report.append(f"**Ï≤≠ÌÇπ {number}:**")
            report.append(f"```\n{piece}\n```")
            report.append(f" ÌÜ†ÌÅ∞ Ïàò: {len(piece_ids)} | ÏûÑÎ≤†Îî© shape: {vector.shape}")
            report.append("")
        except Exception as err:
            # A failure on one chunk is reported inline; the rest still run.
            report.append(f"**Ï≤≠ÌÇπ {number}:**  Ïò§Î•ò Î∞úÏÉù - {err!s}")
            report.append("")

    return "\n".join(report)

# 7. Example text generation
def get_example_text():
    """Return the multi-line sample text used to pre-fill the demo.

    Returns:
        A string made of four sentences, one per line, with a newline
        before the first sentence and after the last one.
    """
    sentences = (
        "ÏïàÎÖïÌïòÏÑ∏Ïöî! Ïù¥Í≤ÉÏùÄ ÌïúÍµ≠Ïñ¥ ÌÖçÏä§Ìä∏ Ï≤≠ÌÇπÏùÑ ÌÖåÏä§Ìä∏ÌïòÍ∏∞ ÏúÑÌïú ÏòàÏ†ú Î¨∏Ïû•ÏûÖÎãàÎã§.",
        "ÏûêÏó∞Ïñ¥ Ï≤òÎ¶¨ÏóêÏÑú Í∏¥ ÌÖçÏä§Ìä∏Î•º ÏûëÏùÄ Îã®ÏúÑÎ°ú ÎÇòÎàÑÎäî Í≤ÉÏùÑ Ï≤≠ÌÇπÏù¥ÎùºÍ≥† Ìï©ÎãàÎã§.",
        "Ï≤≠ÌÇπÏùÄ BERTÏôÄ Í∞ôÏùÄ Ìä∏ÎûúÏä§Ìè¨Î®∏ Î™®Îç∏Ïùò ÏµúÎåÄ ÌÜ†ÌÅ∞ Í∏∏Ïù¥ Ï†úÌïúÏùÑ Ìï¥Í≤∞ÌïòÎäî Ï§ëÏöîÌïú Í∏∞Î≤ïÏûÖÎãàÎã§.",
        "ÌïúÍµ≠Ïñ¥Îäî ÍµêÏ∞©Ïñ¥Ïùò ÌäπÏÑ±ÏÉÅ ÌòïÌÉúÏÜå Î∂ÑÏÑùÏù¥ Î≥µÏû°ÌïòÎ©∞, Îã§ÏñëÌïú Ïñ¥ÎØ∏ Î≥ÄÌôîÍ∞Ä ÏûàÏñ¥ ÌÜ†ÌÅ∞Ìôî Í≥ºÏ†ïÏóêÏÑú Ï£ºÏùòÍ∞Ä ÌïÑÏöîÌï©ÎãàÎã§.",
    )
    # Leading and trailing newlines frame the four sentence lines.
    return "\n" + "\n".join(sentences) + "\n"

# 8. Gradio UI (improved)
# Wires process_text to three inputs (text box, max_tokens slider, overlap
# slider) and one text output, with a pre-filled sample and example rows.
demo = gr.Interface(
    fn=process_text,
    inputs=[
        # Input text, pre-filled with the sample from get_example_text().
        gr.Textbox(
            label="ÌïúÍµ≠Ïñ¥ ÏûÖÎ†• ÌÖçÏä§Ìä∏",
            lines=6,
            placeholder="Í∏¥ ÌïúÍµ≠Ïñ¥ Î¨∏Ïû•ÏùÑ ÏûÖÎ†•ÌïòÏÑ∏Ïöî...",
            value=get_example_text()
        ),
        # NOTE(review): these two ranges permit overlap >= max_tokens
        # (e.g. max_tokens=20 with overlap=50), while chunk_text_korean
        # advances its window by max_tokens - overlap.
        gr.Slider(20, 200, value=50, step=10, label="ÏµúÎåÄ ÌÜ†ÌÅ∞ Ïàò (max_tokens)"),
        gr.Slider(0, 50, value=10, step=5, label="Ïò§Î≤ÑÎû© ÌÜ†ÌÅ∞ Ïàò (overlap)")
    ],
    # process_text returns one newline-joined string that contains
    # Markdown-style markers (** and ```); it is shown in a plain text box.
    outputs=gr.Textbox(label="Ï≤≠ÌÇπ Í≤∞Í≥º Î∞è Î∂ÑÏÑù", lines=20),
    title="üîç ÌïúÍµ≠Ïñ¥ ÌÖçÏä§Ìä∏ Ï≤≠ÌÇπ & ÏûÑÎ≤†Îî© Î∂ÑÏÑùÍ∏∞",
    description="""
    **Í∏∞Îä•:**
    - ÌïúÍµ≠Ïñ¥ ÌÖçÏä§Ìä∏Ïùò ÌÜ†ÌÅ∞ Î∂ÑÏÑù (UNK ÌÜ†ÌÅ∞ Í∞êÏßÄ)
    - ÏßÄÎä•Ìòï ÌÖçÏä§Ìä∏ Ï≤≠ÌÇπ
    - KLUE-BERT Í∏∞Î∞ò ÏûÑÎ≤†Îî© ÏÉùÏÑ±

    **Í∞úÏÑ†ÏÇ¨Ìï≠:**
    - UNK ÌÜ†ÌÅ∞ ÏµúÏÜåÌôîÎ•º ÏúÑÌïú Ï†ÑÏ≤òÎ¶¨
    - Îçî ÎÇòÏùÄ ÌïúÍµ≠Ïñ¥ ÏßÄÏõê Î™®Îç∏ ÏÇ¨Ïö©
    - ÏÉÅÏÑ∏Ìïú ÌÜ†ÌÅ∞ Î∂ÑÏÑù Ï†úÍ≥µ
    """,
    # Each example row is [text, max_tokens, overlap], matching `inputs`.
    examples=[
        [get_example_text(), 50, 10],
        ["Ïù∏Í≥µÏßÄÎä•Í≥º Î®∏Ïã†Îü¨Îãù Í∏∞Ïà†Ïù¥ Í∏âÏÜçÎèÑÎ°ú Î∞úÏ†ÑÌïòÍ≥† ÏûàÏäµÎãàÎã§.", 30, 5],
        ["ÌïúÍµ≠Ïñ¥ ÏûêÏó∞Ïñ¥ Ï≤òÎ¶¨Îäî ÌòïÌÉúÏÜå Î∂ÑÏÑù, Íµ¨Î¨∏ Î∂ÑÏÑù, ÏùòÎØ∏ Î∂ÑÏÑù Îì± Ïó¨Îü¨ Îã®Í≥ÑÎ•º Í±∞Ïπ©ÎãàÎã§.", 40, 8]
    ]
)

# Start the Gradio app only when this module is the entry point, so that
# importing it does not launch a server.
if __name__ == "__main__":
    demo.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0d5e292686cb97cf65.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
