### 환경 설정 및 필요 라이브러리 설치

In [None]:
!pip install wget
!pip install transformers
!pip install torch
!pip install seqeval

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=d963ef5f9f222f2720521bf7f6ff646a7b10f1387d473da204c542f6ff38b55e
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nv

### 데이터 준비 및 전처리

In [None]:
import os

import wget

# Fetch the NER training file from GitHub.
# The download is skipped when the file is already present, so re-running this
# cell neither hits the network again nor produces a renamed duplicate copy.
DATASET_URL = 'https://raw.githubusercontent.com/IDIOcoder/Chat-bot/main/dataset/ner_dataset.txt'
DATASET_PATH = 'ner_dataset.txt'

if os.path.exists(DATASET_PATH):
    train_data = DATASET_PATH
else:
    # wget.download returns the path of the file it wrote.
    train_data = wget.download(DATASET_URL, out=DATASET_PATH)

In [None]:
# Peek at the dataset: print its first 10 lines, stripped of surrounding whitespace
with open(train_data, 'r', encoding='utf-8') as dataset_file:
    lines_shown = 0
    while lines_shown < 10:
        preview_line = dataset_file.readline()
        print(preview_line.strip())
        lines_shown += 1


밀푀유나베 B-FOOD
는 O
어떻게 O
만들어 O
? O

김치찌개 B-FOOD
레시피 O
좀 O
알려주세요 O


### 데이터 로더 정의

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizerFast

# Tokenizer for the KoELECTRA checkpoint, plus the tag <-> index lookup tables
tokenizer = ElectraTokenizerFast.from_pretrained('monologg/koelectra-base-v3-discriminator')
tag2idx = {
    "O": 0,
    "B-FOOD": 1,
    "I-FOOD": 2,
}
# Reverse lookup: class index -> tag string
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

In [None]:
class NERDataset(Dataset):
    """Token-classification dataset read from a two-column text file.

    Every non-blank line of the file holds ``<word> <tag>`` separated by
    whitespace, and blank lines separate sentences. Lines that do not split
    into exactly two fields are reported with a warning and skipped.

    The class uses the module-level ``tokenizer`` and ``tag2idx`` objects.

    Args:
        file_path: Path of the UTF-8 encoded dataset file.
        max_length: Length every encoded example is padded/truncated to.
            Defaults to 128.
    """

    def __init__(self, file_path, max_length=128):
        self.max_length = max_length
        self.sentences = []  # list of word lists, one per sentence
        self.labels = []     # list of tag-string lists, parallel to sentences

        with open(file_path, "r", encoding='utf-8') as f:
            sentence = []
            label = []
            for line in f:
                parts = line.split()
                if not parts:
                    # A blank line closes the sentence collected so far.
                    if sentence:
                        self.sentences.append(sentence)
                        self.labels.append(label)
                        sentence = []
                        label = []
                    continue

                if len(parts) == 2:
                    word, tag = parts
                    sentence.append(word)
                    label.append(tag)
                else:
                    print(f"Warning: Line with incorrect format found: {line.strip()}")

        # The file may end without a trailing blank line.
        if sentence:
            self.sentences.append(sentence)
            self.labels.append(label)

    def __len__(self):
        """Return the number of sentences that were loaded."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Encode sentence ``idx`` and expand its word tags to sub-token labels.

        Returns:
            A dict with ``input_ids``, ``attention_mask``, ``token_type_ids``
            and ``labels`` tensors, each of length ``max_length``.

        Raises:
            KeyError: If the sentence contains a tag missing from ``tag2idx``.
        """
        words = self.sentences[idx]
        word_labels = [tag2idx[tag] for tag in self.labels[idx]]

        encoding = tokenizer(
            words,
            is_split_into_words=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

        # word_ids() gives, per sub-token, the index of the word it came from,
        # or None for special/padding tokens. Using it keeps labels aligned
        # even when a word produces no sub-token at all, which a running
        # counter over offsets cannot handle.
        outside_label = tag2idx["O"]
        token_labels = [
            outside_label if word_idx is None else word_labels[word_idx]
            for word_idx in encoding.word_ids()
        ]
        # No extra padding is needed: the tokenizer call above already pads
        # and truncates to max_length, and token_labels has one entry per token.

        item = {
            'input_ids': torch.tensor(encoding['input_ids']),
            'attention_mask': torch.tensor(encoding['attention_mask']),
            'token_type_ids': torch.tensor(encoding['token_type_ids']),
            'labels': torch.tensor(token_labels)
        }

        return item

def collate_fn(batch):
    """Stack a list of examples into one batch of int64 tensors.

    Every field is right-padded with zeros up to the longest ``input_ids``
    in the batch. Padding is created in the integer dtype directly, so values
    never pass through a floating-point tensor.

    Args:
        batch: List of dicts holding 1-D ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` sequences.

    Returns:
        Dict with the same four keys, each a ``(len(batch), max_len)``
        ``torch.long`` tensor.
    """
    keys = ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
    max_len = max(len(item['input_ids']) for item in batch)

    def _pad(sequence):
        # Right-pad one 1-D sequence with zeros of the same integer dtype.
        sequence = torch.as_tensor(sequence, dtype=torch.long)
        return torch.cat([sequence, sequence.new_zeros(max_len - len(sequence))])

    return {key: torch.stack([_pad(item[key]) for item in batch]) for key in keys}

# Load the dataset and wrap it in a shuffling DataLoader
train_dataset = NERDataset(train_data)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Sanity check: show the first example of the first batch.
# Only positions with attention_mask == 1 are printed, so the output is not
# buried under padding tokens.
for batch in train_dataloader:
    input_ids = batch['input_ids'][0]
    labels = batch['labels'][0]
    real_token_mask = batch['attention_mask'][0].bool()
    tokens = tokenizer.convert_ids_to_tokens(input_ids[real_token_mask].tolist())

    print("Tokens and labels in the first batch:")
    for token, label in zip(tokens, labels[real_token_mask]):
        print(f"{token}: {idx2tag[label.item()]}")
    break


Tokens and labels in the first batch:
[CLS]: O
갈비: B-FOOD
##찜: B-FOOD
만드: O
##는: O
방법: O
알려: O
##주: O
##세요: O
.: O
[SEP]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PAD]: O
[PA

### 모델 정의 및 학습

In [None]:
from torch.optim import AdamW
from tqdm import tqdm
from transformers import ElectraForTokenClassification

# KoELECTRA encoder with a token-classification head sized to the tag set
model = ElectraForTokenClassification.from_pretrained('monologg/koelectra-base-v3-discriminator', num_labels=len(tag2idx))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer: torch.optim.AdamW is used because the AdamW class exported by
# transformers is deprecated in favor of the PyTorch implementation.
# weight_decay and eps are set explicitly to the values transformers.AdamW
# used by default, so the optimization settings stay the same.
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0, eps=1e-6)

# Training loop
model.train()
for epoch in range(5):  # adjust the number of epochs as needed
    total_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains 'labels', so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Mean loss over the batches of this epoch
    print(f"Epoch {epoch+1} Loss: {total_loss/len(train_dataloader)}")


pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 7/7 [00:01<00:00,  3.65it/s]


Epoch 1 Loss: 0.4575267455407551


Training Epoch 2: 100%|██████████| 7/7 [00:01<00:00,  6.67it/s]


Epoch 2 Loss: 0.05902811512351036


Training Epoch 3: 100%|██████████| 7/7 [00:01<00:00,  6.64it/s]


Epoch 3 Loss: 0.03618169017136097


Training Epoch 4: 100%|██████████| 7/7 [00:01<00:00,  6.61it/s]


Epoch 4 Loss: 0.026074534015996114


Training Epoch 5: 100%|██████████| 7/7 [00:01<00:00,  6.60it/s]

Epoch 5 Loss: 0.0201389736362866





### 모델 평가

In [None]:
# from seqeval.metrics import classification_report

# def evaluate(model, dataloader):
#     model.eval()
#     true_labels = []
#     pred_labels = []

#     with torch.no_grad():
#         for batch in tqdm(dataloader, desc="Evaluating"):
#             batch = {k: v.to(device) for k, v in batch.items()}
#             outputs = model(**batch)
#             logits = outputs.logits
#             predictions = torch.argmax(logits, dim=-1)

#             for i, label in enumerate(batch['labels']):
#                 true_labels.append([idx2tag[label_id.item()] for label_id in label if label_id != -100])
#                 pred_labels.append([idx2tag[pred.item()] for pred in predictions[i] if pred != -100])

#     print(classification_report(true_labels, pred_labels))

# evaluate(model, test_dataloader)

### 예측 함수 정의 및 테스트

In [None]:
def predict(model, sentence):
    """Tag one sentence and merge WordPiece continuations back into words.

    The model is switched to eval mode, the sentence is tokenized with the
    module-level ``tokenizer`` and run on ``device``. The raw predicted class
    indices and the token list are printed along the way.

    Args:
        model: Token-classification model whose output exposes ``.logits``.
        sentence: Text to tag.

    Returns:
        A list of ``[word, tag]`` pairs in token order. A token starting with
        ``##`` is appended to the previous word, which keeps the tag of its
        first token. Special tokens are not filtered out.
    """
    model.eval()
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=128)

    with torch.no_grad():
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        logits = model(**encoded).logits
        # Highest-scoring class per token, with the batch dimension squeezed away
        predictions = logits.argmax(dim=-1).squeeze().tolist()
        print(predictions)

    token_ids = encoded['input_ids'].squeeze().tolist()
    tokenized_input = tokenizer.convert_ids_to_tokens(token_ids)
    print(tokenized_input)

    merged = []
    for piece, tag_id in zip(tokenized_input, predictions):
        if not piece.startswith('##'):
            # Start of a new word: record it together with its predicted tag.
            merged.append([piece, idx2tag[tag_id]])
            continue
        # Continuation piece: glue it onto the word currently being built.
        merged[-1][0] += piece[2:]

    return merged

# Run the tagger on a sample sentence
sentence = "숙주나물볶음 레시피 좀 알려주세요"
result = predict(model, sentence)

# Show each merged word next to its predicted tag
for entry in result:
    word, tag = entry
    print(f"{word}: {tag}")


[0, 1, 1, 0, 0, 0, 0, 0, 0, 0]
['[CLS]', '숙주', '##나물', '##볶음', '레시피', '좀', '알려', '##주', '##세요', '[SEP]']
[CLS]: O
숙주나물볶음: B-FOOD
레시피: O
좀: O
알려주세요: O
[SEP]: O


In [None]:
# Persist only the model parameters (state dict), not the whole model object
model_state = model.state_dict()
torch.save(model_state, 'NER_weights.pth')

In [None]:
from google.colab import files

# Download the saved weights file through the Colab `files` helper
files.download('NER_weights.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>