In [None]:
# import requests
# from PIL import Image
# from transformers import BlipProcessor, BlipForConditionalGeneration

# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")

# img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
# raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# # conditional image captioning
# text = "a photography of"
# inputs = processor(raw_image, text, return_tensors="pt").to("cuda")

# out = model.generate(**inputs)
# print(processor.decode(out[0], skip_special_tokens=True))

In [1]:
import os
import time
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [2]:
# Load the pretrained BLIP captioning model and its matching processor.
model_name = "Salesforce/blip-image-captioning-base"

# Resolve the device here instead of hard-coding "cuda", so that this cell
# also runs on a CPU-only runtime rather than raising inside `.to(...)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(model_name)
model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("사용 중인 디바이스:", device)

사용 중인 디바이스: cuda


In [5]:
import os
import pickle
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

class KTSDatasetTorch(Dataset):
    """Dataset yielding ``(image, text, hashtag)`` samples from a KTS directory.

    Records are read from ``<data_address>/<mode>.pickle`` and image files are
    looked up under ``<data_address>/images``. A record is kept only when it
    has an ``img_name`` key, a ``text`` key, a non-empty ``hashtag`` value and
    its image file exists on disk.

    Args:
        data_address: Root directory containing the pickle files and ``images/``.
        mode: Stem of the pickle file to load (``f"{mode}.pickle"``).
        transform: Callable applied to each RGB PIL image. When ``None``,
            :meth:`default_transform` is used.
    """

    def __init__(self, data_address, mode="train", transform=None):
        self.data_address = data_address
        self.image_dir = os.path.join(self.data_address, "images")
        # Compare against None explicitly: a callable that happens to be falsy
        # (for example an empty nn.Sequential, which defines __len__) must not
        # be silently swapped for the default pipeline.
        self.transform = transform if transform is not None else self.default_transform()

        # Load the record list for the requested split.
        # SECURITY: pickle.load can execute arbitrary code while unpickling;
        # only point this at pickle files from a trusted source.
        pickle_file = os.path.join(self.data_address, f"{mode}.pickle")
        with open(pickle_file, "rb") as fr:
            self.dataset = pickle.load(fr)

        # Parallel lists: image path, text and hashtag value of every kept record.
        self.image_paths = []
        self.texts = []
        self.tags = []

        for data in self.dataset:
            if "img_name" in data and "hashtag" in data and data["hashtag"] and "text" in data:
                # Normalise backslash separators and keep only the part after
                # the last "images/" so the name is relative to image_dir.
                relative_name = data["img_name"].replace("\\", "/").split("images/")[-1]
                img_path = os.path.join(self.image_dir, relative_name)
                if os.path.exists(img_path):
                    self.image_paths.append(img_path)
                    self.texts.append(data["text"])
                    self.tags.append(data["hashtag"])

    def __len__(self):
        """Return the number of records that passed filtering in ``__init__``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(transformed_image, text, hashtag)`` for the sample at ``idx``."""
        img_path = self.image_paths[idx]
        text = self.texts[idx]
        tag = self.tags[idx]

        # The context manager closes the underlying file deterministically;
        # convert() returns a new, fully loaded image that outlives the block.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        image = self.transform(image)

        return image, text, tag

    def default_transform(self):
        """Return the fallback pipeline: resize to 84x84, then convert to a tensor."""
        return transforms.Compose([
            transforms.Resize((84, 84)),
            transforms.ToTensor(),
        ])


In [6]:
# Build one dataset per split from a single dataset root, so the path is
# defined (and can be changed) in exactly one place.
KTS_DATA_DIR = "/kaggle/input/korean-tourist-spot-dataset/Korean-Tourist-Spot-Dataset-master/kts/"

train_dataset = KTSDatasetTorch(data_address=KTS_DATA_DIR, mode="train", transform=None)
valid_dataset = KTSDatasetTorch(data_address=KTS_DATA_DIR, mode="val", transform=None)
test_dataset = KTSDatasetTorch(data_address=KTS_DATA_DIR, mode="test", transform=None)

# Merge the train and validation splits into one dataset.
train_valid_dataset = ConcatDataset([train_dataset, valid_dataset])

In [7]:
from torch.utils.data import DataLoader

#  collate_fn 정의: 이미지, 텍스트, 해시태그 리스트를 그대로 묶어줌
def custom_collate_fn(batch):
    """Regroup a list of ``(image, text, tags)`` samples into three parallel lists.

    No stacking or padding is performed; each field is returned as a plain
    Python list in the same order as the samples in ``batch``.
    """
    image_column, text_column, tag_column = (list(column) for column in zip(*batch))
    return image_column, text_column, tag_column

# Training loader: combined train+validation data, reshuffled every epoch.
train_valid_loader = DataLoader(
    train_valid_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=custom_collate_fn
)

# Evaluation loader: must wrap the held-out test split (not the training
# data), and keeps a fixed order so evaluation runs are reproducible.
test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=custom_collate_fn
)

In [8]:
# Inspect the last sample of the combined dataset: container type and arity.
sample = train_valid_dataset[-1]
print("타입:", type(sample))
print("길이:", len(sample))

# Show every field of the sample together with its Python type.
for position, field in enumerate(sample):
    header = f"\n--- sample[{position}] ---"
    print(header)
    print("타입:", type(field))
    print(field)

타입: <class 'tuple'>
길이: 3

--- sample[0] ---
타입: <class 'torch.Tensor'>
tensor([[[0.0745, 0.0745, 0.0745,  ..., 0.0627, 0.0627, 0.0627],
         [0.0745, 0.0745, 0.0745,  ..., 0.0627, 0.0627, 0.0627],
         [0.0745, 0.0745, 0.0745,  ..., 0.0627, 0.0627, 0.0627],
         ...,
         [0.4078, 0.3569, 0.3216,  ..., 0.2118, 0.2235, 0.2039],
         [0.3647, 0.3333, 0.3059,  ..., 0.2157, 0.2784, 0.2941],
         [0.3373, 0.3216, 0.2980,  ..., 0.2196, 0.2627, 0.2392]],

        [[0.0706, 0.0706, 0.0706,  ..., 0.0627, 0.0627, 0.0627],
         [0.0706, 0.0706, 0.0706,  ..., 0.0627, 0.0627, 0.0627],
         [0.0745, 0.0745, 0.0745,  ..., 0.0627, 0.0627, 0.0627],
         ...,
         [0.4157, 0.3569, 0.3176,  ..., 0.2118, 0.2196, 0.2039],
         [0.3725, 0.3333, 0.3020,  ..., 0.2196, 0.2745, 0.2902],
         [0.3412, 0.3216, 0.2980,  ..., 0.2157, 0.2549, 0.2353]],

        [[0.0627, 0.0627, 0.0667,  ..., 0.0588, 0.0549, 0.0549],
         [0.0627, 0.0627, 0.0667,  ..., 0.0549, 0.0

In [9]:
from torch.optim import AdamW
# AdamW over every model parameter, with decoupled weight decay.
optimizer = AdamW(
    params=model.parameters(),
    lr=5e-5,
    weight_decay=0.01,
)

def train_step(batch, max_length=35):
    """Run one optimisation step on ``batch`` and return the loss as a float.

    Uses the module-level ``model``, ``processor``, ``optimizer`` and
    ``device``; the model parameters are updated in place.

    Args:
        batch: ``(images, texts, tags)`` triple, e.g. as produced by
            ``custom_collate_fn``. Each entry of ``tags`` is joined with
            spaces to form the target string (presumably a list of hashtag
            strings; verify against the dataset).
        max_length: Token length that both the text inputs and the hashtag
            labels are padded/truncated to. One shared value keeps the two
            sequences the same length.

    Returns:
        The scalar loss of this step (``loss.item()``).
    """
    model.train()
    images, texts, tags = batch

    # Encode images + text with the processor.
    # do_rescale=False is passed because the images are expected to be
    # already scaled (e.g. by ToTensor).
    # NOTE(review): confirm that the installed transformers version forwards
    # `do_rescale` to the image processor rather than to the tokenizer.
    inputs = processor(
        images=images,
        text=texts,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length,
        do_rescale=False
    ).to(device)

    # Turn each sample's hashtags into a single space-separated target string.
    tags_text = [" ".join(tag_list) for tag_list in tags]
    tokenized = processor.tokenizer(
        tags_text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=max_length
    )
    labels = tokenized.input_ids.to(device)
    # -100 is the ignore index of the loss, so padding positions do not count.
    labels[labels == processor.tokenizer.pad_token_id] = -100

    # Forward pass with labels so that the model returns the loss.
    # NOTE(review): input_ids come from `texts` while labels come from the
    # hashtags; confirm this pairing is the intended training objective.
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        pixel_values=inputs["pixel_values"],
        labels=labels
    )

    loss = outputs.loss
    loss.backward()

    # Apply the update, then clear gradients so the next step starts clean.
    optimizer.step()
    optimizer.zero_grad()

    return loss.item()


In [None]:
import torch

# Checkpoint destination
model_save_path = "/kaggle/working/best_model_epoch100.pth"

# Training hyper-parameters and early-stopping state
num_epochs = 100
log_interval = 100
patience_limit = 2
best_loss = float("inf")
patience_counter = 0

for epoch in range(num_epochs):
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_valid_loader):
        loss = train_step(batch)
        total_loss += loss

        # Periodic progress line with the loss of the current step.
        reached_log_point = (step + 1) % log_interval == 0
        if reached_log_point:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{step+1}], Loss: {loss:.4f}")

    avg_loss = total_loss / len(train_valid_loader)
    print(f"Epoch {epoch+1} 완료! 평균 손실: {avg_loss:.4f}")

    # Early stopping: handle the "no improvement" case first.
    # `not (a < b)` rather than `a >= b` keeps the outcome the same if the
    # average loss is NaN.
    if not (avg_loss < best_loss):
        patience_counter += 1
        print(f"손실 개선 없음. patience: {patience_counter}/{patience_limit}")
        if patience_counter >= patience_limit:
            print("Early stopping triggered!")
            break
        continue

    # Improvement: record the new best loss and reset the patience counter.
    best_loss = avg_loss
    patience_counter = 0
    print(f"손실 개선! best_loss 갱신: {best_loss:.4f}")

    # Save the weights of the best epoch so far.
    torch.save(model.state_dict(), model_save_path)
    print(f"모델 저장됨: {model_save_path}")

Epoch [1/100], Step [100], Loss: 3.5089
Epoch [1/100], Step [200], Loss: 3.3449
Epoch [1/100], Step [300], Loss: 3.5218
Epoch [1/100], Step [400], Loss: 3.1459
Epoch [1/100], Step [500], Loss: 3.0169
Epoch [1/100], Step [600], Loss: 3.7519
Epoch [1/100], Step [700], Loss: 3.1518
Epoch [1/100], Step [800], Loss: 3.7377
Epoch [1/100], Step [900], Loss: 3.0562
Epoch [1/100], Step [1000], Loss: 3.4165
Epoch 1 완료! 평균 손실: 3.5777
손실 개선! best_loss 갱신: 3.5777
모델 저장됨: /kaggle/working/best_model_epoch100.pth
Epoch [2/100], Step [100], Loss: 4.0858
Epoch [2/100], Step [200], Loss: 3.7266
Epoch [2/100], Step [300], Loss: 3.2218
Epoch [2/100], Step [400], Loss: 3.7787
Epoch [2/100], Step [500], Loss: 3.3418
Epoch [2/100], Step [600], Loss: 3.1449
Epoch [2/100], Step [700], Loss: 3.6330
Epoch [2/100], Step [800], Loss: 3.0870
Epoch [2/100], Step [900], Loss: 3.3549
Epoch [2/100], Step [1000], Loss: 3.4856
Epoch 2 완료! 평균 손실: 3.3588
손실 개선! best_loss 갱신: 3.3588
모델 저장됨: /kaggle/working/best_model_epoch10