In [1]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
import torch.nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
torch.cuda.set_per_process_memory_fraction(11/24)

In [2]:
torch.cuda.set_per_process_memory_fraction(11/24)

## 1. 데이터 로드
train/test 로 구성되어 있는 것을 볼 수 있습니다. 

In [3]:
imdb = load_dataset('imdb')
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## validation을 위한 데이터 샘플링

In [4]:
train_valid = imdb['train']
test = imdb['test']
# split train/validation
train_valid = train_valid.train_test_split(test_size=0.1)
train = train_valid['train']
valid = train_valid['test']

In [5]:
pos_neg_count = [0, 0]
for example in train:
    pos_neg_count[example['label']] += 1
print(f"label distribution for train dataset : {pos_neg_count}")

label distribution for train dataset : [11260, 11240]


In [6]:
pos_neg_count = [0, 0]
for example in valid:
    pos_neg_count[example['label']] += 1
print(f"label distribution for train dataset : {pos_neg_count}")

label distribution for train dataset : [1240, 1260]


In [7]:
print(f"train sample text : {train[0]['text']}")
print(f"train sample label : {train[0]['label']}") # 0: negative, 1: positive

train sample text : I first saw this absolutely riveting documentary in it's initial release back in 2001,and it really had a profound effect on me, so much that I bugged several of my friends to see it with me on repeat screenings. The bottom line:none of my friends walked away disappointed (ever!). This stellar film is about Scottish conceptual artist, Andy Goldsworthy,who creates some absolutely beautiful pieces of art using natural materials (wood,water,flowers,rocks,etc.)to create pieces that eventually return to their natural form (a statement in the temporary state of everything?). We get to see Goldsworthy create several works of temporary art,as well as some of his long term installations in major galleries around the world,as well as a few pieces in the natural world,as well. German film maker,Thomas Riedelsheimer directs,photographs & edits this meditation on the creative process that is a real treat for both the eye & ear (with an ambient musical score,composed & performed 

# 2. 단순 Model, Tokenizer 로드
Pretrained LM을 사용할 때는 함께 학습된 tokenizer를 이용해야 합니다. 

In [8]:
model = AutoModel.from_pretrained("bert-base-uncased", add_pooling_layer=False)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

2024-02-02 02:48:29.145984: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['bert.pooler.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'bert.pooler.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing B

## NLP에서 모델 입력은 다음 순서로 이루어집니다. 
1. 원본 텍스트
2. Tokenized Input : Tokenizer를 통과하여 모델의 입력으로 전처리된 데이터
3. Embedding or Contextulaized Vector or Hidden Representation : 모델을 통과하여 생성된 벡터

In [12]:
input_text = train[0]['text'][:10]
tokenized_text = tokenizer.tokenize(input_text)
tokenized_text_with_special_tokens = tokenizer.tokenize(input_text, add_special_tokens=True)
print(f"original text : {input_text}")
print(f"tokenized text : {tokenized_text}")
print(f"tokenized text with special tokens : {tokenized_text_with_special_tokens}")


original text : I first sa
tokenized text : ['i', 'first', 'sa']
tokenized text with special tokens : ['[CLS]', 'i', 'first', 'sa', '[SEP]']


In [17]:
input_text = train[0]['text'][:10]
tokenized_input = tokenizer(input_text, return_tensors='pt')
for key, value in tokenized_input.items():
    print(f"{key} : {value[0]}")

# input_ids : 각 토큰의 vocab id(embedding layer의 해당 토큰 vector와 맵핑)
# token_type_ids : segment embedding을 위한 id (BERT 등 특정 모델에서만 사용)
# attention_mask : attention을 위한 mask (Transformer 기반 모델들은 공통적으로 사용, 사실상 NLP 모든 모델에서 사용)

input_ids : tensor([ 101, 1045, 2034, 7842,  102])
token_type_ids : tensor([0, 0, 0, 0, 0])
attention_mask : tensor([1, 1, 1, 1, 1])


In [16]:
tokenized_txt = tokenizer(input_text)['input_ids']
print(f"tokenized text : {tokenized_txt}")
tokenizer.decode(tokenized_txt)

tokenized text : [101, 1045, 2034, 7842, 102]


'[CLS] i first sa [SEP]'

In [11]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

입력된 데이터는 token_id -> embedding -> transformer blocks -> representation으로 산출됩니다. 

# 3. From Text to Logit 

In [50]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
optimizer.step()

In [51]:
model.train()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [52]:
tokenized_input = tokenizer(train[0]['text'], return_tensors='pt')

In [53]:
representation = model(**tokenized_input) # 모델 입력으로 input_ids, token_type_ids, attention_mask를 받습니다.
representation # 실제로는 NLP 모델들은 여러가지 output을 받아볼 수 있도록 설정되어 있어, 필요한 경우 중간 layer representation, attention map 등도 간단히 return이 가능합니다. 

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1989, -0.0048,  0.2971,  ..., -0.2877,  0.5979,  0.7716],
         [ 0.0737, -0.1801, -0.1338,  ...,  0.0100,  0.6722,  0.2568],
         [-0.2193, -0.5881, -0.6665,  ...,  0.1343,  0.1884, -0.0985],
         ...,
         [ 0.4321,  0.1643, -0.2640,  ..., -0.1180,  0.5758, -0.2244],
         [-0.3143, -0.6689,  0.0094,  ...,  0.4401,  0.3329, -0.6912],
         [-0.0704,  0.6689, -0.1986,  ...,  0.3935,  0.3162,  0.0596]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=None, hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

In [54]:
representation['last_hidden_state'].shape # (batch_size, sequence_length, hidden_size)
# 각 토큰의 representation이 산출되는 것을 확인할 수 있습니다. 

torch.Size([1, 375, 768])

## 하지만 우리가 원하는 것은 하나의 입력 sample에 대한 긍부정 분류 결과일 겁니다. 
## 이를 위해 linear layer를 통과시켜 logit값을 만들어봅시다. 

In [55]:
classification_head = torch.nn.Linear(768, 2) # bert hidden size -> binary classification
pooled_output = torch.mean(representation['last_hidden_state'], dim=1) # 입력 토큰의 모든 representation의 평균을 취합니다.
print(f"Pooled Output for a Sample : {pooled_output.shape}")
logits = classification_head(pooled_output)
print(f"Logit for a Sample : {logits.shape}") # binary classification을 위한 logit이 산출됩니다.

Pooled Output for a Sample : torch.Size([1, 768])
Logit for a Sample : torch.Size([1, 2])


# 4. Loss Calculation and BackProp for Train

In [56]:
loss_fn = torch.nn.CrossEntropyLoss()
loss = loss_fn(logits, torch.tensor([train[0]['label']]))
print(f"Loss for a Sample : {loss}")

Loss for a Sample : 0.9723962545394897


In [57]:
loss.backward()

In [58]:
model.embeddings.word_embeddings.weight.grad.shape # (vocab_size, hidden_size)  

torch.Size([30522, 768])

In [59]:
classification_head.weight.grad.shape # (num_labels, hidden_size)
classification_head.weight.grad

tensor([[-0.0612,  0.1357,  0.1675,  ..., -0.0110,  0.2155, -0.0380],
        [ 0.0612, -0.1357, -0.1675,  ...,  0.0110, -0.2155,  0.0380]])

In [60]:
# 첫번째 layer의 attention value linear layer의 weight를 확인해보겠습니다.
print("Weight for Attn Value Linear Layer") 
print(f"Shape : {model.base_model.encoder.layer[0].attention.self.value.weight.shape}")
print(model.base_model.encoder.layer[0].attention.self.value.weight)

# loss 역전파를 통해 계산된 gradient 역시 아래와 같습니다.
print("Gradient for Attn Value Linear Layer")
print(f"Shape : {model.base_model.encoder.layer[0].attention.self.value.weight.grad.shape}")
print(model.base_model.encoder.layer[0].attention.self.value.weight.grad)


Weight for Attn Value Linear Layer
Shape : torch.Size([768, 768])
Parameter containing:
tensor([[ 1.1403e-02,  1.1445e-03, -9.5199e-03,  ...,  2.3602e-02,
         -2.7326e-02,  9.3509e-05],
        [-2.6590e-02, -1.2196e-02, -4.2571e-02,  ...,  5.9239e-02,
         -1.1393e-02, -2.5042e-02],
        [ 1.9075e-02, -2.3710e-02,  3.1961e-02,  ...,  5.6482e-03,
         -3.7891e-02, -2.2433e-02],
        ...,
        [-3.1478e-02, -2.0522e-02,  3.1003e-02,  ..., -3.9800e-02,
         -5.0545e-02, -5.9328e-03],
        [-6.6795e-03,  4.2564e-02,  4.7859e-02,  ...,  2.5147e-02,
          1.6209e-02, -8.9153e-03],
        [-1.9984e-03, -2.4571e-02,  5.6221e-03,  ..., -4.9603e-02,
         -5.8003e-02, -3.3787e-02]], requires_grad=True)
Gradient for Attn Value Linear Layer
Shape : torch.Size([768, 768])
tensor([[ 1.3102e-04, -1.4789e-03, -9.4285e-04,  ...,  4.8157e-04,
         -5.4509e-04,  3.9943e-05],
        [-1.6418e-04,  1.2516e-04,  2.0550e-04,  ...,  2.5226e-05,
         -3.5675e-05, 

### Optimizer Step을 통해 모델을 업데이트시킵시다. 

In [61]:
optimizer.step()

In [62]:
# 첫번째 layer의 attention value linear layer의 weight를 확인해보겠습니다.
print("Weight for Attn Value Linear Layer") 
print(f"Shape : {model.base_model.encoder.layer[0].attention.self.value.weight.shape}")
print(model.base_model.encoder.layer[0].attention.self.value.weight)

# loss 역전파를 통해 계산된 gradient 역시 아래와 같습니다.
print("Gradient for Attn Value Linear Layer")
print(f"Shape : {model.base_model.encoder.layer[0].attention.self.value.weight.grad.shape}")
print(model.base_model.encoder.layer[0].attention.self.value.weight.grad)


Weight for Attn Value Linear Layer
Shape : torch.Size([768, 768])
Parameter containing:
tensor([[ 1.1393e-02,  1.1544e-03, -9.5100e-03,  ...,  2.3592e-02,
         -2.7316e-02,  8.4646e-05],
        [-2.6581e-02, -1.2205e-02, -4.2581e-02,  ...,  5.9242e-02,
         -1.1396e-02, -2.5051e-02],
        [ 1.9065e-02, -2.3706e-02,  3.1970e-02,  ...,  5.6574e-03,
         -3.7900e-02, -2.2423e-02],
        ...,
        [-3.1468e-02, -2.0527e-02,  3.0994e-02,  ..., -3.9791e-02,
         -5.0536e-02, -5.9266e-03],
        [-6.6695e-03,  4.2570e-02,  4.7859e-02,  ...,  2.5155e-02,
          1.6215e-02, -8.9213e-03],
        [-2.0083e-03, -2.4570e-02,  5.6318e-03,  ..., -4.9595e-02,
         -5.8012e-02, -3.3781e-02]], requires_grad=True)
Gradient for Attn Value Linear Layer
Shape : torch.Size([768, 768])
tensor([[ 1.3102e-04, -1.4789e-03, -9.4285e-04,  ...,  4.8157e-04,
         -5.4509e-04,  3.9943e-05],
        [-1.6418e-04,  1.2516e-04,  2.0550e-04,  ...,  2.5226e-05,
         -3.5675e-05, 

In [63]:
optimizer.zero_grad()

In [65]:
# 첫번째 layer의 attention value linear layer의 weight를 확인해보겠습니다.
print("Weight for Attn Value Linear Layer") 
print(f"Shape : {model.base_model.encoder.layer[0].attention.self.value.weight.shape}")
print(model.base_model.encoder.layer[0].attention.self.value.weight)

# Zero Grad로 Gradient를 지웠으므로 이제 gradient는 None입니다.
print("Gradient for Attn Value Linear Layer")
print(f"Shape : {model.base_model.encoder.layer[0].attention.self.value.weight.grad.shape}")
print(model.base_model.encoder.layer[0].attention.self.value.weight.grad)


Weight for Attn Value Linear Layer
Shape : torch.Size([768, 768])
Parameter containing:
tensor([[ 1.1393e-02,  1.1544e-03, -9.5100e-03,  ...,  2.3592e-02,
         -2.7316e-02,  8.4646e-05],
        [-2.6581e-02, -1.2205e-02, -4.2581e-02,  ...,  5.9242e-02,
         -1.1396e-02, -2.5051e-02],
        [ 1.9065e-02, -2.3706e-02,  3.1970e-02,  ...,  5.6574e-03,
         -3.7900e-02, -2.2423e-02],
        ...,
        [-3.1468e-02, -2.0527e-02,  3.0994e-02,  ..., -3.9791e-02,
         -5.0536e-02, -5.9266e-03],
        [-6.6695e-03,  4.2570e-02,  4.7859e-02,  ...,  2.5155e-02,
          1.6215e-02, -8.9213e-03],
        [-2.0083e-03, -2.4570e-02,  5.6318e-03,  ..., -4.9595e-02,
         -5.8012e-02, -3.3781e-02]], requires_grad=True)
Gradient for Attn Value Linear Layer


AttributeError: 'NoneType' object has no attribute 'shape'

# 5. Overall Train Pipeline
## HuggingFace를 적절히 활용하면 아래처럼 학습 코드를 단순화할 수 있습니다. 

In [1]:
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
import torch.nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import wandb
import logging

from tqdm import tqdm
import os
torch.cuda.set_per_process_memory_fraction(11/24)

# set seed
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [2]:
# wandb 사이트에서 authorization key를 발급받아 입력합니다. 
# script로 작업 시에는 명령어 창에서 입력하면 되어서 별도 login이 필요하지 않습니다. 
wandb.login(key='bfe6b67a5bdd260c5771d108854328d7e7698267')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjaehee_kim[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jaehee/.netrc


True

In [3]:
model = AutoModel.from_pretrained("bert-base-uncased", add_pooling_layer=False)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
classification_head = torch.nn.Linear(768, 2)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

2024-02-02 05:55:03.669962: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing B

In [4]:
imdb = load_dataset('imdb')
train_valid = imdb['train']
test = imdb['test']
# split train/validation
train_valid = train_valid.train_test_split(test_size=0.1)
train = train_valid['train']
valid = train_valid['test']

In [5]:
class IMDBDataset(Dataset):
    def __init__(self, data):
        self.data = data.to_dict()

    def __getitem__(self, idx):
        input_data = {}
        input_data['input_ids'] = self.data['input_ids'][idx]
        input_data['token_type_ids'] = self.data['token_type_ids'][idx]
        input_data['attention_mask'] = self.data['attention_mask'][idx]
        input_data['label'] = self.data['label'][idx]
        return input_data

    def __len__(self):
        return len(self.data['input_ids'])
    
def collate_fn(batch):
    data_dict = {"input_ids" : [], "token_type_ids" : [], "attention_mask" : [], "label" : []}
    for data in batch : 
        data_dict['input_ids'].append(data['input_ids'])
        data_dict['token_type_ids'].append(data['token_type_ids'])
        data_dict['attention_mask'].append(data['attention_mask'])
        data_dict['label'].append(data['label'])
    data_dict['input_ids'] = torch.tensor(data_dict['input_ids'])
    data_dict['token_type_ids'] = torch.tensor(data_dict['token_type_ids'])
    data_dict['attention_mask'] = torch.tensor(data_dict['attention_mask'])
    data_dict['label'] = torch.tensor(data_dict['label'])
    return data_dict

In [6]:
# 1. dataset tokenization
tokenized_train = train.map(lambda example: tokenizer(example['text'], truncation=True, padding='max_length'))
tokenized_valid = valid.map(lambda example: tokenizer(example['text'], truncation=True, padding='max_length'))
tokenized_test = test.map(lambda example: tokenizer(example['text'], truncation=True, padding='max_length'))

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [7]:
# 2. dataloader
train_dataloader = DataLoader(IMDBDataset(tokenized_train), batch_size=8, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(IMDBDataset(tokenized_valid), batch_size=4, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(IMDBDataset(tokenized_test), batch_size=4, shuffle=False, collate_fn=collate_fn)

In [8]:
for batch in train_dataloader:
    print(batch['input_ids'].shape)
    print(batch['token_type_ids'].shape)
    print(batch['attention_mask'].shape)
    print(batch['label'].shape)
    break

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


In [9]:
for key, value in tokenized_train[0].items():
    print(f"{key} : {value}")

text : While movie titles contains the word 'Mother', the first thing that comes to our mind will be a mother's love for her children.<br /><br />However, The Mother tells a different story.<br /><br />The Mother do not discuss the love between a mother and her child, or how she sacrifice herself for the benefit of her child. Here, Notting Hill director Roger Michell tells us how a mother's love for a man about half of her age hurts the people around her.<br /><br />Before Daniel Craig takes on the role of James Bond, here, he plays Darren, a man who is helping to renovate the house of the son of the mother, and sleeping with her daughter as well. Anne Reid, who was a familiar face on TV series, takes up the challenging role of the leading character, May.<br /><br />The story begins with May coping with the sudden loss of her husband, Toots, in a family visit to her son, Bobby. While she befriends Darren, a handyman who is doing some renovation in Bobby's house, she was shocked to foun

In [None]:
EPOCH = 3
step = 0
model.to('cuda')
classification_head.to('cuda')

# tqdm을 사용할 때 전체 step 수를 계산하기 위해 EPOCH * len(train_dataloader)를 사용합니다.
counter = tqdm(range(EPOCH*len(train_dataloader)), desc="Training :")
# wandb에서 실험 내용을 확인할 수 있도록 wandb init을 수행합니다.
# 이때 실험에 사용된 다양한 hyperparameter들을 config로 함께 기록합니다.
wandb.init(
    project="dsba_pretrain_nlp_exp1", 
    name='[imdb] bert-base-uncased',
    config={
        "model": "bert-base-uncased",
        "optimizer": "Adam",
        "lr": 1e-5,
        "batch_size": 8,
        "epoch": EPOCH,
        "max_seq_length": 512
    })

logging_dir = "log"
if os.path.exists(logging_dir) == False:
    os.makedirs(logging_dir)
logging.basicConfig(
    filename=f"{logging_dir}/[imdb] bert-base-uncased-train.log",
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s:%(message)s"
)
logger = logging.getLogger()

for epoch in range(EPOCH):
    for batch in train_dataloader:
        # 학습에 사용할 데이터를 cuda로 옮깁니다.
        label = batch['label'].to('cuda')
        model_input = {key: value.to('cuda') for key, value in batch.items() if key != 'label'}
        # 이전 step에서 계산된 gradient를 지웁니다.
        optimizer.zero_grad()
        # pretrained lm을 통과시켜 representation을 산출합니다.
        representation = model(**model_input)
        # representation을 이용해 classification을 수행합니다.
        pooled_output = torch.mean(representation['last_hidden_state'], dim=1)
        logits = classification_head(pooled_output)
        # loss를 계산하고, 역전파를 수행합니다.
        train_loss = loss_fn(logits, label)
        train_loss.backward()
        optimizer.step()
        step += 1
        counter.update(1)
        # wandb에 loss를 기록합니다.
        wandb.log({"loss": train_loss})
        if step % 100 == 0:
            print(f"step : {step}, loss : {train_loss}")
            logging.info(f"step : {step}, loss : {train_loss}")

        # validation set을 이용해 모델의 성능을 확인합니다.
        if step % 100 == 0:
            with torch.no_grad():
                model.eval()
                classification_head.eval()
                valid_loss = 0
                valid_step = 0
                valid_acc = []
                for batch in valid_dataloader:
                    label = batch['label'].to('cuda')
                    model_input = {key: value.to('cuda') for key, value in batch.items() if key != 'label'}
                    representation = model(**model_input)
                    pooled_output = torch.mean(representation['last_hidden_state'], dim=1)
                    logits = classification_head(pooled_output)
                    loss = loss_fn(logits, label)
                    valid_loss += loss.item()
                    valid_step += 1
                    valid_acc.append(torch.argmax(logits, dim=-1) == label)
                print(f"valid loss : {valid_loss/valid_step}")
                print(f"valid acc : {torch.cat(valid_acc, dim=0).float().mean()}")
                # wandb에 valid loss와 valid accuracy를 기록합니다.
                wandb.log({"valid_loss": valid_loss/valid_step, "valid_acc": torch.cat(valid_acc, dim=0).float().mean()})
                logger.info(f"valid loss : {valid_loss/valid_step}")
                model.train()
                classification_head.train()

        if step % 1000 == 0: # 1000 step마다 모델을 저장합니다.
            if os.path.exists(f"log/model_log/{step}") == False:
                os.makedirs(f"log/model_log/{step}", exist_ok=True)
            torch.save(model.state_dict(), f"log/model_log/{step}/bert_model_{step}.pt")
            torch.save(classification_head.state_dict(), f"log/model_log/{step}/bert_classification_head_{step}.pt")

# 모델 학습이 끝나면 test set을 이용해 모델의 성능을 확인합니다.
with torch.no_grad() :
    model.eval()
    classification_head.eval()
    test_loss = 0
    test_step = 0
    test_acc = []
    for batch in tqdm(test_dataloader):
        label = batch['label'].to('cuda')
        model_input = {key: value.to('cuda') for key, value in batch.items() if key != 'label'}
        representation = model(**model_input)
        pooled_output = torch.mean(representation['last_hidden_state'], dim=1)
        logits = classification_head(pooled_output)
        loss = loss_fn(logits, label)
        test_loss += loss
        test_step += 1
        test_acc.append(torch.argmax(logits, dim=-1) == label)

print(f"test loss : {test_loss/test_step}")
print(f"test acc : {torch.cat(test_acc, dim=0).float().mean()}")
wandb.log({"test_loss": test_loss/test_step, "test_acc": torch.cat(test_acc, dim=0).float().mean()})
logger.info(f"test loss : {test_loss/test_step}")

if step % 1000 == 0: # 1000 step마다 모델을 저장합니다.
    if os.path.exists(f"log/model_log/{step}") == False:
        os.makedirs(f"log/model_log/{step}", exist_ok=True)
torch.save(model.state_dict(), f"log/model_log/{step}/bert_model_last.pt")
torch.save(classification_head.state_dict(), f"log/model_log/{step}/bert_classification_head_last.pt")