<a href="https://colab.research.google.com/github/KwonDuHyeon/hanghae99/blob/main/3%EC%A3%BC%EC%B0%A8%EA%B8%B0%EB%B3%B8%EA%B3%BC%EC%A0%9C(%EA%B6%8C%EB%91%90%ED%98%84).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## [MY CODE] 라이브러리 import 및 fancyzhx/ag_news Load

In [None]:
# !pip install tqdm boto3 requests regex sentencepiece sacremoses datasets #로컬로 사용

In [None]:
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader

# Load the 'distilbert-base-uncased' tokenizer from PyTorch Hub; collate_fn
# below reads it as a module-level global.
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'distilbert-base-uncased')

Using cache found in /home/duhyeon/.cache/torch/hub/huggingface_pytorch-transformers_main


In [None]:
ds = load_dataset("fancyzhx/ag_news")  # switched the dataset to AG News


def collate_fn(batch):
  """Collate dataset rows into a padded token-id tensor and a label tensor.

  Args:
    batch: iterable of rows, each indexable by 'text' and 'label'.

  Returns:
    (texts, labels): `texts` is a LongTensor of token ids padded to the
    longest sequence in the batch and capped at `max_len` tokens; `labels`
    is a LongTensor holding the rows' labels.
  """
  max_len = 400
  texts = [row['text'] for row in batch]
  labels = [row['label'] for row in batch]

  # `max_length` only takes effect together with a truncation strategy. Without
  # `truncation=True` the cap is ignored, and an unusually long text could
  # exceed the encoder's 512 position embeddings and fail inside the model.
  encoded = tokenizer(texts, padding=True, truncation=True, max_length=max_len)
  texts = torch.LongTensor(encoded.input_ids)
  labels = torch.LongTensor(labels)

  return texts, labels


# Both splits use the same batch size and collate function; only the training
# split is shuffled so that evaluation order stays fixed.
train_loader = DataLoader(ds['train'], batch_size=64, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(ds['test'], batch_size=64, collate_fn=collate_fn, shuffle=False)

이제 pre-trained DistilBERT를 불러옵니다. 이번에는 PyTorch hub에서 제공하는 DistilBERT를 불러와 봅시다.

In [None]:
# Load the bare pre-trained DistilBERT encoder from PyTorch Hub. The trailing
# bare `model` expression makes the notebook display its module structure.
# NOTE: `model` is re-bound to a TextClassifier instance in a later cell.
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
model

Using cache found in /home/duhyeon/.cache/torch/hub/huggingface_pytorch-transformers_main


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Linear(i

In [None]:
from torch import nn


class TextClassifier(nn.Module):
  """DistilBERT encoder followed by a linear head that outputs 4 class logits.

  The classification head reads the hidden state at sequence position 0 of
  the encoder's `last_hidden_state`.
  """

  def __init__(self):
    super().__init__()

    self.encoder = torch.hub.load('huggingface/pytorch-transformers', 'model', 'distilbert-base-uncased')
    self.classifier = nn.Linear(768, 4)  # AG News has 4 classes

  def forward(self, x, attention_mask=None):
    """Compute class logits for a batch of token ids.

    Args:
      x: integer tensor of token ids, indexed as (batch, sequence).
      attention_mask: optional mask with the same shape as `x` (1 = real
        token, 0 = padding). When omitted, it is derived from the encoder
        config's `pad_token_id` so that padded positions are not attended to.

    Returns:
      Tensor of raw (un-normalised) logits, one row of 4 values per input.
    """
    if attention_mask is None:
      # Without a mask the encoder attends to padding, which makes the
      # position-0 representation depend on how much padding the batch has.
      pad_id = getattr(self.encoder.config, 'pad_token_id', None)
      if pad_id is not None:
        attention_mask = (x != pad_id).long()

    hidden = self.encoder(x, attention_mask=attention_mask)['last_hidden_state']
    return self.classifier(hidden[:, 0])


# Re-bind `model` (previously the bare encoder) to the classifier that the
# following cells freeze, train and evaluate.
model = TextClassifier()

Using cache found in /home/duhyeon/.cache/torch/hub/huggingface_pytorch-transformers_main


In [None]:
# Freeze every encoder parameter so that only the classification head is trained.
model.encoder.requires_grad_(False)

## [MY CODE] CrossEntropyLoss 추가

In [None]:
from torch.optim import Adam
import numpy as np
import matplotlib.pyplot as plt
import os

# Directory where a checkpoint is written after every epoch
save_dir = "/mnt/d/hanghae99/3rd/base/"
os.makedirs(save_dir, exist_ok=True)

lr = 0.001
n_epochs = 10
model = model.to('cuda')

# Integer class labels with raw logits as model output, hence CrossEntropyLoss
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)

for epoch in range(n_epochs):
  model.train()
  total_loss = 0.

  for inputs, labels in train_loader:
    inputs = inputs.to('cuda')
    labels = labels.to('cuda').long()  # class indices must be integer typed

    model.zero_grad()
    preds = model(inputs)  # raw logits
    loss = criterion(preds, labels)
    loss.backward()
    optimizer.step()

    # Sum of per-batch losses over the epoch (not averaged)
    total_loss += loss.item()

  # Checkpoint files are numbered from 1, whereas the log line below is 0-based
  save_path = os.path.join(save_dir, f"epoch_{epoch + 1}.pth")
  torch.save(model.state_dict(), save_path)

  print(f"Epoch {epoch:3d} | Train Loss: {total_loss}")

Epoch   0 | Train Loss: 884.5469363629818
Epoch   1 | Train Loss: 697.6648135110736
Epoch   2 | Train Loss: 672.7772943526506
Epoch   3 | Train Loss: 659.3012258708477
Epoch   4 | Train Loss: 650.2033046931028
Epoch   5 | Train Loss: 643.3497982025146
Epoch   6 | Train Loss: 642.2062663212419
Epoch   7 | Train Loss: 641.7624578252435
Epoch   8 | Train Loss: 639.5016624033451
Epoch   9 | Train Loss: 634.8990609422326


## [MY CODE] CrossEntropyLoss에 맞게 accuracy 수정

In [None]:
# Accuracy histories; the evaluation code later in this cell appends one
# train value and one test value to them.
train_acc_list = []
test_acc_list = []


def accuracy(model, dataloader):
  """Return the fraction of samples in `dataloader` that `model` classifies correctly.

  Args:
    model: module mapping a batch of inputs to per-class logits.
    dataloader: iterable yielding `(inputs, labels)` tensor pairs.

  Returns:
    Correct predictions divided by total samples as a float, or 0.0 when the
    dataloader yields no samples.

  The model's train/eval mode is left untouched; call `model.eval()` first if
  dropout should be disabled.
  """
  # Run on whatever device the model lives on instead of assuming a GPU; keep
  # 'cuda' as the fallback for a model that has no parameters.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cuda')

  cnt = 0
  acc = 0

  # Gradients are never needed for evaluation, so do not rely on the caller
  # to disable them.
  with torch.no_grad():
    for inputs, labels in dataloader:
      inputs, labels = inputs.to(device), labels.to(device).long()

      # Pick the class with the highest logit
      preds = torch.argmax(model(inputs), dim=-1)

      cnt += labels.shape[0]
      acc += (labels == preds).sum().item()

  return acc / cnt if cnt else 0.0


# Evaluate once on both splits in eval mode, without tracking gradients
model.eval()
with torch.no_grad():
  train_acc = accuracy(model, train_loader)
  test_acc = accuracy(model, test_loader)

train_acc_list.append(train_acc)
test_acc_list.append(test_acc)
print(f"=========> Train acc: {train_acc:.3f} | Test acc: {test_acc:.3f}")



## [MY CODE] 학습 결과 테스트

In [None]:
def load_model(filepath, model_class):
    """Build `model_class()`, restore its weights from `filepath`, and return it.

    The returned model has been moved to 'cuda' and switched to eval mode.
    """
    restored = model_class()
    state_dict = torch.load(filepath)
    restored.load_state_dict(state_dict)
    # Both `.to()` and `.eval()` return the module itself, so they can be chained
    return restored.to('cuda').eval()

def preprocess_text(text, tokenizer, max_length=400, device='cuda'):
    """Tokenize `text` and return its token ids and attention mask on `device`.

    Args:
        text: input passed straight to `tokenizer`.
        tokenizer: callable returning a mapping with 'input_ids' and
            'attention_mask' tensors when called with `return_tensors="pt"`.
        max_length: maximum number of tokens; longer inputs are truncated.
        device: device the returned tensors are moved to. Defaults to 'cuda',
            matching the previous hard-coded behaviour.

    Returns:
        (input_ids, attention_mask) tensors located on `device`.
    """
    tokenized = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    return tokenized['input_ids'].to(device), tokenized['attention_mask'].to(device)


# Text prediction helper
def predict_text(model, tokenizer, text, label_map):
    """Classify a single `text` and return its label name from `label_map`.

    The text is tokenized with `preprocess_text`, the model is run without
    gradient tracking, and the arg-max class index is looked up in `label_map`.
    """
    # The attention mask returned by preprocess_text is not used by the model call
    token_ids, _ = preprocess_text(text, tokenizer)
    with torch.no_grad():
        logits = model(token_ids)
    class_index = logits.argmax(dim=-1).item()
    return label_map[class_index]


model_path = '/mnt/d/hanghae99/3rd/base/epoch_10.pth'
# Bind the restored checkpoint to `model`. If the return value is discarded,
# the prediction below silently uses whatever `model` is left in the kernel
# rather than the weights stored in `model_path`.
model = load_model(model_path, TextClassifier)
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'distilbert-base-uncased')
# Mapping from class index to label name, as used by this notebook
label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

text = "The latest advancements in AI have revolutionized technology."
predicted_label = predict_text(model, tokenizer, text, label_map)
print(f"Predicted label: {predicted_label}")

Using cache found in /home/duhyeon/.cache/torch/hub/huggingface_pytorch-transformers_main
Using cache found in /home/duhyeon/.cache/torch/hub/huggingface_pytorch-transformers_main


Predicted label: Sci/Tech
