In [2]:
import kagglehub
import chardet
from pathlib import Path
import pandas as pd
import torch

path = kagglehub.dataset_download("debasisdotcom/name-entity-recognition-ner-dataset")
dir = Path(path)
path_lst = [path for path in dir.rglob('*')]
data_path = path_lst[0]

device = 'cuda' if torch.cuda.is_available() else 'cpu'

with open(data_path, 'rb') as file:
    encoding = chardet.detect(file.read())

In [3]:
learning_rate=2e-5
per_device_train_batch_size=16
per_device_eval_batch_size=16
num_train_epochs=2
weight_decay=0.01

In [4]:
# 데이터셋입니다.
df = pd.read_csv(data_path, encoding='Windows-1252')
# ner과 pos에 대한 설명이 있는 파일입니다. 
ner_df = pd.read_csv("data/NER_tags.csv")
ner_unique = list(ner_df['NER 태그'][1:].apply(lambda x: x.split('-')[1]).unique())
ner_unique.append("O")
index_result = []
for unique_tag in ner_unique:
    for i, tag in enumerate(ner_df['NER 태그'].to_list()):
        if len(tag) > 1 and unique_tag==tag.split('-')[1]:
            index_result.append(i)

        elif len(tag) == 1 and unique_tag==tag:
            index_result.append(i)
        

ner_df = ner_df.iloc[index_result].reset_index(drop=True)
pos_df = pd.read_csv("data/POS_tags.csv")

In [5]:
from IPython.display import display
display(ner_df[['NER 태그','설명']])
print()
display(pos_df[['POS 태그','설명']])

Unnamed: 0,NER 태그,설명
0,B-geo,위치(지리적 개체)의 시작
1,I-geo,위치(지리적 개체)의 내부
2,B-tim,시간 표현의 시작
3,I-tim,시간 표현의 내부
4,B-org,조직(Organization)의 시작
5,I-org,조직(Organization)의 내부
6,I-per,사람(Person)의 내부
7,B-per,사람(Person)의 시작
8,B-gpe,정치적/지리적 단위의 시작
9,I-gpe,정치적/지리적 단위의 내부





Unnamed: 0,POS 태그,설명
0,NN,단수 일반 명사
1,NNP,단수 고유 명사
2,IN,전치사
3,DT,한정사(관사)
4,JJ,형용사
5,NNS,복수 일반 명사
6,.,마침표
7,VBD,과거형 동사
8,",",쉼표
9,VBN,과거분사


In [6]:
import pandas as pd
id2label = {index: value for index, value in ner_df['NER 태그'].items()}
label2id = {value: index for index, value in ner_df['NER 태그'].items()}
label_list = list(label2id.keys())
df['Tag'] = df['Tag'].apply(lambda x: label2id[x] if not pd.isnull(x) else x)

In [10]:
# 16번 태그(기타)가 84% 이상임을 알 수 있다.
# 따라서 모델 설계를 해서 metric이 84$ 이상 나와야 유효한 모델임을 유추할 수 있다.
# 모두 16번만 찍어도 84%가 accuracy로 찍히는 라벨 불균형 상태이기 때문이다.
proportin_df = pd.DataFrame(df['Tag'].value_counts(normalize=True).sort_index())
display(proportin_df)

Unnamed: 0_level_0,proportion
Tag,Unnamed: 1_level_1
0,0.0359
1,0.007071
2,0.019391
3,0.006226
4,0.01921
5,0.016006
6,0.016452
7,0.016203
8,0.015135
9,0.000189


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, ModernBertModel
import torch
model_id = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(
    model_id, 
    num_labels=len(label_list), 
    id2label=id2label,
    label2id=label2id
    )
model = model.to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import numpy as np
index_lst = [i for i, row in df.iterrows() if not pd.isna(row['Sentence #'])]

index_lst = np.array(df[~df['Sentence #'].apply(lambda x: pd.isna(x))].index)
index_lst = np.append(index_lst, len(df))
start_index = index_lst[:-1]
end_index = index_lst[1:]

result = []
for start, end in zip(start_index, end_index):
    data = df.iloc[start:end]
    result.append(data)
    
x_list = []
y_list = []
for data in result:
    token_lst = data['Word'].to_list()
    label_lst = data['Tag'].to_list()
    if not (any(pd.isnull(x) for x in token_lst) or any(pd.isnull(x) for x in label_lst)):
        x_list.append(token_lst)
        y_list.append(label_lst)
index_lst = list(range(len(x_list)))

In [None]:
from datasets import Dataset, DatasetDict, Features, Value, Sequence

dataset_dict = {
    'id': index_lst,
    'tokens': x_list,
    'ner_tags': y_list,
}
dataset = Dataset.from_dict(dataset_dict)

dataset

Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 47949
})

In [None]:
def tokenize_and_align_labels(examples):
    # samples의 첫번째는 토큰이 29개입니다. 따라서 라벨도 29개가 나와야 합니다.
    samples = examples

    # 첫번째 샘플의 input_ids의 수는 41입니다. 즉 토큰화된 것을 한 번 더 토크나이징 하는 것입니다. 
    tokenized_inputs  = tokenizer(samples['tokens'], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(samples['ner_tags']):

        # 한 문장 내에서 토큰이 몇 번째 단어에 매핑되는지 알려줍니다. 
        # None은 CLS와 SEP으로 매핑됩니다.
        # input_ids의 수가 41이기 때문에, 라벨 또한 41로 늘려야 합니다. 
        # 현재는 29개이기 때문에 아래의 logic을 거치는 것입니다.
        word_ids = tokenized_inputs.word_ids(batch_index = i)
        previous_word_idx = None
        label_ids = []

        # 아래 로직은 다음 역할을 합니다.
        # cls와 sep엔 -100을 부여합니다.
        # 한 단어에서 잘려나온 토큰에 대해선 첫 번째 토큰을 제외한 나머지 토큰에 대해 -100을 부여합니다.
        # 그 외의 모든 토큰에 대해선 라벨 값을 그대로 사용합니다.
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)
tokenized_dataset

Map:   0%|          | 0/47949 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 47949
})

In [None]:
train_val_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_val_dataset = train_val_test_split['train']
test_dataset = train_val_test_split['test']

train_val_split = train_val_dataset.train_test_split(test_size=0.13)
train_dataset = train_val_split['train']
val_dataset = train_val_split['test']

dataset_input = DatasetDict({
    'train':train_dataset,
    'val':val_dataset,
    'test':test_dataset,
})
dataset_input

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 37543
    })
    val: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5611
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4795
    })
})

In [11]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [12]:
import evaluate

seqeval = evaluate.load("seqeval")

In [13]:
import numpy as np

labels = [label_list[i] for i in dataset_input['train'][0][f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels,  zero_division=0)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [14]:
from transformers import TrainingArguments, Trainer
import torch._dynamo
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch._dynamo.config.suppress_errors = True

training_args = TrainingArguments(
    output_dir=".results/week3_advanced_ner",
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    eval_strategy="epoch",
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to='none'
)

trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=dataset_input["train"],
    eval_dataset=dataset_input["val"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1186,0.111187,0.799516,0.814558,0.806967,0.965679
2,0.0925,0.104038,0.813069,0.819338,0.816192,0.967587


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=4694, training_loss=0.12815557015820875, metrics={'train_runtime': 316.9642, 'train_samples_per_second': 236.891, 'train_steps_per_second': 14.809, 'total_flos': 855214290355218.0, 'train_loss': 0.12815557015820875, 'epoch': 2.0})

In [16]:
from pprint import pprint
metrics = trainer.predict(dataset_input["test"])
pprint(metrics)

  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[[-9.65326801e-02, -1.36011994e+00,  4.81627643e-01, ...,
         -1.18671012e+00, -2.15134573e+00,  7.33527184e+00],
        [-6.43327653e-01, -1.35887873e+00,  9.41215232e-02, ...,
         -1.37261772e+00, -2.08675838e+00,  9.89927578e+00],
        [-8.28978539e-01, -1.22003126e+00, -2.02378854e-01, ...,
         -1.42313635e+00, -2.02780652e+00,  1.01041059e+01],
        ...,
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02],
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02],
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02]],

       [[ 6.39653146e-01, -1.44535923e+00,  6.84136450e-01, ...,
         -1.29861951e+00, -2.17158294e+00,  6.30365181e+00],
        [-1.18354425e-01, -1.61225247e+00, -5.10575771e-02, ...,
         