In [1]:
from google.colab import userdata
key = userdata.get('hf-api')

from huggingface_hub import login
login(token=key)

In [None]:
!pip install transformers --upgrade
!pip install datasets
!pip install evaluate

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")


In [7]:
def tokenize_and_align_tags(records):
    # 입력 단어를 토큰으로 분리함. 예를 들어, ChatGPT경우, ['Chat', '##G', '##PT']로 분해
    tokenized_results = tokenizer(records["tokens"], truncation=True, is_split_into_words=True)  # 입력 토큰화, 길이제한, 단어분리됨 표시
    input_tags_list = []  # 정렬된 태그들을 담을 리스트 초기화

    for i, given_tags in enumerate(records["ner_tags"]):
        word_ids = tokenized_results.word_ids(batch_index=i) # 토큰들의 원래 단어 인덱스 가져옴
        previous_word_id = None
        input_tags = []

        for wid in word_ids:
            if wid is None:     # 특수 토큰 (e.g., [CLS], [SEP])
                input_tags.append(-100)   # 무시 태그 추가
            elif wid != previous_word_id:  # 새로운 단어의 첫 토큰
                input_tags.append(given_tags[wid])  # 원래 태그 추가
            else:  # 이전 단어의 subword 토큰
                input_tags.append(-100)
            previous_word_id = wid  # 현재 토큰 인덱스 저장

        input_tags_list.append(input_tags)  # 정렬된 태그 리스트에 추가

    tokenized_results["labels"] = input_tags_list  # 결과에 정렬된 태그들 추가
    return tokenized_results

In [8]:
from datasets import load_dataset
wnut = load_dataset('wnut_17') # WNUT 17 데이터셋을 로드함. (https://huggingface.co/datasets/leondz/wnut_17)
tokenized_wnut = wnut.map(tokenize_and_align_tags, batched=True) # tokenize_and_align_tags 함수를 사용하여 데이터셋을 토큰화하고 태그를 정렬함. batched=True는 배치 처리를 의미함.
tag_names = wnut["test"].features[f"ner_tags"].feature.names  # 테스트 데이터셋에서 ner_tags의 feature 이름들을 가져옴. (즉, 태그 이름 목록임)
id2label = dict(enumerate(tag_names))  # 태그 이름들을 인덱스(ID)와 매핑하는 딕셔너리를 생성함. (예: {0: 'O', 1: 'B-person', ...})
label2id = dict(zip(id2label.values(), id2label.keys()))  # id2label의 값과 키를 뒤집어서 태그 이름을 인덱스로 매핑하는 딕셔너리를 생성함. (예: {'O': 0, 'B-person': 1, ...})


README.md:   0%|          | 0.00/9.05k [00:00<?, ?B/s]

wnut_17.py:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

The repository for wnut_17 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/wnut_17.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/115k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/192k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3394 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1009 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1287 [00:00<?, ? examples/s]

Map:   0%|          | 0/3394 [00:00<?, ? examples/s]

Map:   0%|          | 0/1009 [00:00<?, ? examples/s]

Map:   0%|          | 0/1287 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    "dslim/bert-base-NER", num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True
)

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-base-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([9, 768])

In [10]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
training_args = TrainingArguments(
    output_dir="ner-wnut-model",
)

In [12]:
# 하이퍼파라메터 튜닝용
training_args = TrainingArguments(
    output_dir="ner-wnut-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

In [13]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_wnut["train"],
    eval_dataset=tokenized_wnut["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [14]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmac999[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.247555
2,No log,0.263236


TrainOutput(global_step=426, training_loss=0.14505344265503503, metrics={'train_runtime': 184.7181, 'train_samples_per_second': 36.748, 'train_steps_per_second': 2.306, 'total_flos': 211741856300556.0, 'train_loss': 0.14505344265503503, 'epoch': 2.0})

In [18]:
!ls
!ls ./ner-wnut-model/ -l

ner-wnut-model	sample_data  wandb
total 421744
drwxr-xr-x 2 root root      4096 Apr 26 12:10 checkpoint-213
drwxr-xr-x 2 root root      4096 Apr 26 12:11 checkpoint-426
-rw-r--r-- 1 root root      1251 Apr 26 12:11 config.json
-rw-r--r-- 1 root root 430942044 Apr 26 12:11 model.safetensors
drwxr-xr-x 3 root root      4096 Apr 26 12:08 runs
-rw-r--r-- 1 root root       125 Apr 26 12:11 special_tokens_map.json
-rw-r--r-- 1 root root      1291 Apr 26 12:11 tokenizer_config.json
-rw-r--r-- 1 root root    669021 Apr 26 12:12 tokenizer.json
-rw-r--r-- 1 root root      5304 Apr 26 12:12 training_args.bin
-rw-r--r-- 1 root root    213450 Apr 26 12:12 vocab.txt


In [22]:
from transformers import pipeline
model = AutoModelForTokenClassification.from_pretrained("ner-wnut-model")
tokenizer = AutoTokenizer.from_pretrained("ner-wnut-model")


In [23]:
classifier = pipeline("ner", model=model, tokenizer=tokenizer)
out = classifier("My name is Sylvain and I work at Hugging Face in Brooklyn.")
print(out)


Device set to use cuda:0


[{'entity': 'B-person', 'score': np.float32(0.9480278), 'index': 4, 'word': 'S', 'start': 11, 'end': 12}, {'entity': 'B-person', 'score': np.float32(0.8513039), 'index': 5, 'word': '##yl', 'start': 12, 'end': 14}, {'entity': 'B-person', 'score': np.float32(0.7769072), 'index': 6, 'word': '##va', 'start': 14, 'end': 16}, {'entity': 'B-person', 'score': np.float32(0.7973245), 'index': 7, 'word': '##in', 'start': 16, 'end': 18}, {'entity': 'B-location', 'score': np.float32(0.6619479), 'index': 12, 'word': 'Hu', 'start': 33, 'end': 35}, {'entity': 'B-location', 'score': np.float32(0.52010304), 'index': 13, 'word': '##gging', 'start': 35, 'end': 40}, {'entity': 'I-location', 'score': np.float32(0.64878863), 'index': 14, 'word': 'Face', 'start': 41, 'end': 45}, {'entity': 'B-location', 'score': np.float32(0.93580455), 'index': 16, 'word': 'Brooklyn', 'start': 49, 'end': 57}]


In [27]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=71adf4b6d0dcd62e5e4ed53cd5eb3f2325678237f6c776ecea5420230df4b324
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [28]:
import evaluate
seqeval = evaluate.load("seqeval")
results = trainer.evaluate()
print(results)


{'eval_loss': 0.2475549429655075, 'eval_runtime': 8.0126, 'eval_samples_per_second': 160.622, 'eval_steps_per_second': 10.109, 'epoch': 2.0}


In [31]:

import numpy as np

def compute_metrics(p):
    predictions = p.predictions
    labels = p.label_ids
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


In [33]:
metrics = compute_metrics(trainer.predict(tokenized_wnut["test"]))

import json
print(json.dumps(metrics, indent=4))

{
    "precision": 0.5063775510204082,
    "recall": 0.36793327154772937,
    "f1": 0.42619431025228127,
    "accuracy": 0.9440882277507053
}


In [None]:
trainer.push_to_hub()