# Load Packages & Lib.

In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, BertConfig,AutoTokenizer
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

curr_path = os.getcwd()

# Data Load

In [2]:
# 데이터 세트 불러오기
df = pd.read_csv(os.path.join(curr_path,'artwork_data.csv')) # 환경에 맞게 수정해주십쇼!

# 몇 개 샘플 시각화
print(df.head())

       Artist Name                                      Artwork Title  \
0  Berenice Abbott                                       Eugène Atget   
1  Berenice Abbott                                Jean Cocteau, Paris   
2  Berenice Abbott                                Nightview, New York   
3  Berenice Abbott  Canyon: Broadway and Exchange Place, New York ...   
4  Berenice Abbott  Gunsmith and Police Department Headquarters, 6...   

                                 Artwork Description  
0  In this portrait, Eugène Atget, with a bemused...  
1  Berenice Abbott portrays Jean Cocteau, French ...  
2  Abbott's photograph depicts the city of New Yo...  
3  The image focuses on the facade of the Exchang...  
4  This image of a shop sign captures Abbott's fa...  


In [3]:
# 데이터셋 클래스 정의
class ArtDataset(Dataset):
    def __init__(self, descriptions, tokenizer, max_length):
        self.descriptions = descriptions
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        description = self.descriptions[idx]
        inputs = self.tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)


# 데이터 로드
descriptions = df['Artwork Description'].tolist()  # 설명 텍스트 컬럼 이름

In [4]:
# 토큰화
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 512  # BERT 입력 최대 길이

In [5]:
# 데이터셋 및 데이터로더 준비
dataset = ArtDataset(descriptions, tokenizer, max_length)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Model

In [6]:
# BERT MLM 모델 로드
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [8]:
epochs = 200
# 추가 필요 변수 정의
# 모델과 토크나이저를 저장할 디렉토리 생성
model_save_dir = os.path.join(curr_path,'models')
os.makedirs(model_save_dir, exist_ok=True)
best_model_path = os.path.join(model_save_dir,'best_model.pth')  # 최상의 모델을 저장할 경로
tokenizer_save_path = os.path.join(model_save_dir, 'tokenizer')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

best_loss = float('inf')
patience = 3
no_improvement = 0


In [9]:
# 모델 학습 진행
model.train()
for epoch in range(epochs):
    train_loss = 0
    for batch in train_loader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        torch.cuda.empty_cache()  # GPU 캐시 비우기

    # 에포크당 평균 손실 계산
    avg_train_loss = train_loss / len(train_loader)

    # 검증 데이터셋에서 성능 평가
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for batch in val_loader:
            input_ids, attention_mask = batch
            input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss}, Validation Loss: {avg_val_loss}")

    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        no_improvement = 0
        torch.save(model.state_dict(), best_model_path)  # 모델 상태 저장
        tokenizer.save_pretrained(tokenizer_save_path)  # 토크나이저 저장
        print("Saved Best Model and Tokenizer")
    else:
        no_improvement+=1
    # Early Stopping 체크
    if no_improvement == patience:
        print("Early stopping triggered")
        break

Epoch 1/200, Training Loss: 1.0301819561021541, Validation Loss: 0.06786366245325874
Saved Best Model and Tokenizer
Epoch 2/200, Training Loss: 0.03576318872397515, Validation Loss: 0.019984886488493753
Saved Best Model and Tokenizer
Epoch 3/200, Training Loss: 0.015480947854525563, Validation Loss: 0.010959754478843772
Saved Best Model and Tokenizer
Epoch 4/200, Training Loss: 0.00954994658675434, Validation Loss: 0.007225397823597579
Saved Best Model and Tokenizer
Epoch 5/200, Training Loss: 0.0067068501757874864, Validation Loss: 0.005237742217586321
Saved Best Model and Tokenizer
Epoch 6/200, Training Loss: 0.004997071799641447, Validation Loss: 0.003979321836274774
Saved Best Model and Tokenizer
Epoch 7/200, Training Loss: 0.0038944634178013945, Validation Loss: 0.0031657014863894265
Saved Best Model and Tokenizer
Epoch 8/200, Training Loss: 0.0031476177765167694, Validation Loss: 0.0025938077802386355
Saved Best Model and Tokenizer
Epoch 9/200, Training Loss: 0.002602507182181493

In [10]:

# 모델 및 토크나이저 불러오기
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.load_state_dict(torch.load(best_model_path))
model.cuda()

tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# 모델 평가
model.eval()
total_loss = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.cuda(), attention_mask.cuda()

        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        total_loss += outputs.loss.item()

avg_loss = total_loss / len(val_loader)
print(f"Validation Loss: {avg_loss}")

Validation Loss: 2.9740003008234507e-05


## 사전학습 모델 저장

In [13]:
# # 모델 저장하기

# model_save_path = "" # 환경에 맞게 수정해주십쇼!
# model.save_pretrained(model_save_path)

# # 토크나이저 저장하기
# tokenizer_save_path = "저장할 경로" # 환경에 맞게 수정해주십쇼!
# tokenizer.save_pretrained(tokenizer_save_path)

## cf) 나중에 저장된 정보 불러오는 법

In [14]:
'''
# 저장된 모델 불러오기
model = BertForMaskedLM.from_pretrained(model_save_path)

# 저장된 토크나이저 불러오기
tokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)
'''

'\n# 저장된 모델 불러오기\nmodel = BertForMaskedLM.from_pretrained(model_save_path)\n\n# 저장된 토크나이저 불러오기\ntokenizer = BertTokenizer.from_pretrained(tokenizer_save_path)\n'