In [1]:
# 🧩 시퀀스 라벨링(Token Classification) – 토큰 단위 NER
!pip -q install -U transformers datasets accelerate seqeval sentencepiece

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np, torch, random
from seqeval.metrics import f1_score, classification_report

SEED=42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# 1) 데이터 로드
ds = load_dataset("klue", "ner")
label_list = ds["train"].features["ner_tags"].feature.names
id2label = {i:l for i,l in enumerate(label_list)}
label2id = {l:i for i,l in enumerate(label_list)}

# 2) 토크나이저/모델
model_name = "klue/roberta-small"
tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id
)

# 3) 토큰-라벨 정렬
def tokenize_and_align(batch):
    tokenized = tok(batch["tokens"], truncation=True, is_split_into_words=True, max_length=192)
    labels = []
    for i, tags in enumerate(batch["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        prev = None; label_ids=[]
        for w in word_ids:
            if w is None:
                label_ids.append(-100)
            elif w != prev:
                label_ids.append(tags[w]); prev = w
            else:
                # 서브워드에는 I-태그가 일반적이지만 여기선 그대로 tag 붙임
                label_ids.append(tags[w])
        labels.append(label_ids)
    tokenized["labels"] = labels
    return tokenized

encoded = ds.map(tokenize_and_align, batched=True)

collator = DataCollatorForTokenClassification(tok)

# 4) 메트릭
def compute_metrics(p):
    logits, labels = p
    preds = np.argmax(logits, axis=-1)
    true_entities, pred_entities = [], []
    for p_vec, l_vec in zip(preds, labels):
        true_seq=[]; pred_seq=[]
        for p_id, l_id in zip(p_vec, l_vec):
            if l_id == -100:
                continue
            true_seq.append(label_list[l_id])
            pred_seq.append(label_list[p_id])
        true_entities.append(true_seq)
        pred_entities.append(pred_seq)
    return {"f1_seqeval": f1_score(true_entities, pred_entities)}

# 5) 학습
args = TrainingArguments(
    output_dir="./klue-ner-roberta-small",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded["train"].shuffle(SEED).select(range(4000)),
    eval_dataset=encoded["validation"],
    tokenizer=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer.train()
print(trainer.evaluate())

# 6) 추론 예시
sent = "카카오가 판교 본사에서 네이버와 공동 세미나를 개최했다."
enc = tok(sent, return_tensors="pt", truncation=True, max_length=192).to(model.device) # Tokenize the sentence directly
with torch.no_grad():
    outputs = model(**enc)
    logits = outputs[0].softmax(-1).cpu().numpy() # Access the first element of the tuple
pred_ids = logits.argmax(-1)[0] # Get predictions for the first sequence

word_ids = enc.word_ids(0)
output_tokens = []
current_word = []
current_tags = []

for i, word_id in enumerate(word_ids):
    if word_id is None:
        continue
    if word_id != (word_ids[i-1] if i>0 else None):
        if current_word:
            output_tokens.append(("".join(current_word), current_tags[0].replace("B-", "").replace("I-", ""))) # Append the previous word and its entity type
        current_word = [tok.convert_ids_to_tokens([enc["input_ids"][0][i]])[0].replace("##", "")] # Start a new word
        current_tags = [id2label[pred_ids[i]]]
    else:
        current_word.append(tok.convert_ids_to_tokens([enc["input_ids"][0][i]])[0].replace("##", ""))
        current_tags.append(id2label[pred_ids[i]])

if current_word: # Append the last word
    output_tokens.append(("".join(current_word), current_tags[0].replace("B-", "").replace("I-", "")))

print(output_tokens)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

ner/train-00000-of-00001.parquet:   0%|          | 0.00/4.21M [00:00<?, ?B/s]

ner/validation-00000-of-00001.parquet:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21008 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/21008 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1 Seqeval
1,0.3573,0.366096,0.457987


{'eval_loss': 0.3660961389541626, 'eval_f1_seqeval': 0.457987314457595, 'eval_runtime': 4.8586, 'eval_samples_per_second': 1029.103, 'eval_steps_per_second': 64.422, 'epoch': 1.0}
[('카카오가', 'O'), ('판교', 'O'), ('본사에서', 'O'), ('네이버와', 'O'), ('공동', 'O'), ('세미나를', 'O'), ('개최했다', 'O'), ('.', 'O')]
