In [1]:
#open_clip, transformers 설치
!pip install open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-2.28.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Downloading open_clip_torch-2.28.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, open_clip_torch
Successfully installed ftfy-6.3.0 open_clip_torch-2.28.0


In [2]:
!pip install transformers



In [3]:
import torch
import os
import PIL
import PIL.Image
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import torchvision.transforms as T
from transformers import T5ForConditionalGeneration, T5Tokenizer
from open_clip import create_model_from_pretrained, get_tokenizer 

In [4]:
# metadata csv load
txt_nlp_aug = pd.read_csv('/kaggle/input/curated-cxr-report-generation-dataset/NLP_aug_datasets/df_train_aug.csv')
txt_nlp_aug['path'] = txt_nlp_aug['path'].str.replace('../input/curated-cxr-report-generation-dataset', '/kaggle/input/curated-cxr-report-generation-dataset')

# using only 1,00 data for testing
test_df = txt_nlp_aug[:3000]

# preprocess function for image input for biomedclip
transform = T.Compose([
    T.Resize(size=224, interpolation=T.InterpolationMode.BICUBIC),
    T.CenterCrop(size=(224, 224)),
    T.ToTensor(),  # 먼저 Tensor로 변환
    T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
])

In [5]:
# (1) Make custom dataset for training
class CustomDataset(Dataset):
    def __init__(self, df, transform):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    # get item (image: path value at df, text: raw text at df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx] 
        image_path = row['path']
        image = PIL.Image.open(image_path).convert('RGB') 
        image_resize = self.transform(image)
        text = row['text']
        return image_resize, text  

# Preprare dataset and dataloader
dataset = CustomDataset(test_df, transform)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [6]:
dataset[0]

(tensor([[[-1.6463, -1.6317, -1.6317,  ..., -1.5879, -1.5879, -1.5879],
          [-1.6609, -1.6463, -1.6317,  ..., -1.5879, -1.5879, -1.5879],
          [-1.6609, -1.6609, -1.6463,  ..., -1.5879, -1.5879, -1.5879],
          ...,
          [-0.5076, -0.3178, -0.1718,  ...,  0.4705,  0.4559,  0.4413],
          [-0.4784, -0.2594, -0.1134,  ...,  0.4997,  0.4997,  0.4851],
          [-0.4200, -0.2302, -0.2156,  ...,  0.5581,  0.5581,  0.5435]],
 
         [[-1.6020, -1.5870, -1.5870,  ..., -1.5420, -1.5420, -1.5420],
          [-1.6170, -1.6020, -1.5870,  ..., -1.5420, -1.5420, -1.5420],
          [-1.6170, -1.6170, -1.6020,  ..., -1.5420, -1.5420, -1.5420],
          ...,
          [-0.4314, -0.2363, -0.0862,  ...,  0.5741,  0.5591,  0.5441],
          [-0.4014, -0.1763, -0.0262,  ...,  0.6041,  0.6041,  0.5891],
          [-0.3414, -0.1463, -0.1313,  ...,  0.6642,  0.6642,  0.6491]],
 
         [[-1.3380, -1.3238, -1.3238,  ..., -1.2811, -1.2811, -1.2811],
          [-1.3522, -1.3380,

In [7]:
class ClipT5TrainingModel(nn.Module):
    def __init__(self, clip_model, t5_model_name: str, projection_dim: int):
        super(ClipT5TrainingModel, self).__init__()
        self.clip_model = clip_model  
        self.t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)
        self.tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
        
        clip_dim = 512  # CLIP 임베딩 차원
        t5_hidden_dim = self.t5_model.config.d_model  # T5 hidden dimension
        self.clip_project = nn.Linear(clip_dim, t5_hidden_dim)  # CLIP 투영을 T5 hidden dim으로

    def forward(self, images, captions, max_length: int = 50):
        # 이미지 처리
        stack_img = torch.stack(images).to(images[0].device)
        with torch.no_grad():
            image_features = self.clip_model.encode_image(stack_img).float()

        # 캡션 처리
        prompts = ["generate radiology report of this x-ray image: " + caption for caption in captions]
        input_ids = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True,
                                   max_length=max_length).input_ids.to(stack_img.device)

        # CLIP 이미지 임베딩 투영
        projected_features = self.clip_project(image_features)
        projected_features = projected_features.unsqueeze(1)  # (batch_size, 1, hidden_dim)

        # T5 텍스트 임베딩과 결합
        inputs_embeds = self.t5_model.encoder.embed_tokens(input_ids)  # 텍스트 임베딩
        combined_embeds = torch.cat([projected_features, inputs_embeds], dim=1)  # 이미지 + 텍스트 임베딩

        # T5 모델 훈련
        labels = input_ids  # 정답 캡션
        outputs = self.t5_model(inputs_embeds=combined_embeds, labels=labels)
        
        return outputs.loss

In [8]:
# (3) Train Loop
def train(model, data_loader, optimizer, device, epochs=5):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, captions in tqdm(data_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
            images = [img.to(device) for img in images]  # 모든 이미지를 CUDA로 이동
            captions = [caption for caption in captions]  # 캡션 리스트로 변환
            optimizer.zero_grad()
            loss = model(images, captions)  # 모델 호출

            # 역전파 및 최적화
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {running_loss / len(data_loader)}")

In [9]:
# 모델 준비
device = "cuda" if torch.cuda.is_available() else "cpu"
biomedclip_model, _ = create_model_from_pretrained('hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224')
biomedclip_model.to(device)  # CLIP 모델을 CUDA로 이동
model = ClipT5TrainingModel(biomedclip_model, "t5-small", projection_dim=512).to(device)

# 옵티마이저 준비
optimizer = AdamW(model.parameters(), lr=5e-5)

# 훈련 실행
train(model, data_loader, optimizer, device, epochs=50)

open_clip_pytorch_model.bin:   0%|          | 0.00/784M [00:00<?, ?B/s]

open_clip_config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Epoch 1/50: 100%|██████████| 375/375 [01:30<00:00,  4.13it/s]


Epoch 1, Loss: 0.07623564900333683


Epoch 2/50: 100%|██████████| 375/375 [01:03<00:00,  5.93it/s]


Epoch 2, Loss: 0.011073662080181141


Epoch 3/50: 100%|██████████| 375/375 [01:03<00:00,  5.93it/s]


Epoch 3, Loss: 0.004344890181906522


Epoch 4/50: 100%|██████████| 375/375 [01:03<00:00,  5.87it/s]


Epoch 4, Loss: 0.0030970600159683577


Epoch 5/50: 100%|██████████| 375/375 [01:04<00:00,  5.80it/s]


Epoch 5, Loss: 0.003201987090986222


Epoch 6/50: 100%|██████████| 375/375 [01:02<00:00,  6.01it/s]


Epoch 6, Loss: 0.0020166627276533592


Epoch 7/50: 100%|██████████| 375/375 [01:01<00:00,  6.14it/s]


Epoch 7, Loss: 0.0017811195037017267


Epoch 8/50: 100%|██████████| 375/375 [01:01<00:00,  6.11it/s]


Epoch 8, Loss: 0.0018971930705398942


Epoch 9/50: 100%|██████████| 375/375 [01:01<00:00,  6.06it/s]


Epoch 9, Loss: 0.001510621333339562


Epoch 10/50: 100%|██████████| 375/375 [01:01<00:00,  6.06it/s]


Epoch 10, Loss: 0.0010450083407728622


Epoch 11/50: 100%|██████████| 375/375 [01:02<00:00,  6.03it/s]


Epoch 11, Loss: 0.0010740447730834906


Epoch 12/50: 100%|██████████| 375/375 [01:02<00:00,  6.05it/s]


Epoch 12, Loss: 0.0011914646395404513


Epoch 13/50: 100%|██████████| 375/375 [01:02<00:00,  6.04it/s]


Epoch 13, Loss: 0.0012322769651267056


Epoch 14/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 14, Loss: 0.0009767162930996469


Epoch 15/50: 100%|██████████| 375/375 [01:01<00:00,  6.06it/s]


Epoch 15, Loss: 0.0009476449826033786


Epoch 16/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 16, Loss: 0.0006085409397880236


Epoch 17/50: 100%|██████████| 375/375 [01:02<00:00,  6.01it/s]


Epoch 17, Loss: 0.0009581193085177801


Epoch 18/50: 100%|██████████| 375/375 [01:02<00:00,  6.05it/s]


Epoch 18, Loss: 0.0010651686613370355


Epoch 19/50: 100%|██████████| 375/375 [01:01<00:00,  6.08it/s]


Epoch 19, Loss: 0.0008050297779846005


Epoch 20/50: 100%|██████████| 375/375 [01:02<00:00,  6.00it/s]


Epoch 20, Loss: 0.0006756080372530656


Epoch 21/50: 100%|██████████| 375/375 [01:01<00:00,  6.07it/s]


Epoch 21, Loss: 0.00042227669235823365


Epoch 22/50: 100%|██████████| 375/375 [01:02<00:00,  6.04it/s]


Epoch 22, Loss: 0.0004958872680920953


Epoch 23/50: 100%|██████████| 375/375 [01:02<00:00,  6.03it/s]


Epoch 23, Loss: 0.0006432849234940174


Epoch 24/50: 100%|██████████| 375/375 [01:02<00:00,  5.99it/s]


Epoch 24, Loss: 0.0006140886475138055


Epoch 25/50: 100%|██████████| 375/375 [01:02<00:00,  6.04it/s]


Epoch 25, Loss: 0.000910866222557767


Epoch 26/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 26, Loss: 0.00042764658464390475


Epoch 27/50: 100%|██████████| 375/375 [01:02<00:00,  5.98it/s]


Epoch 27, Loss: 0.00035562086915403294


Epoch 28/50: 100%|██████████| 375/375 [01:02<00:00,  6.04it/s]


Epoch 28, Loss: 0.00040563988883513956


Epoch 29/50: 100%|██████████| 375/375 [01:01<00:00,  6.07it/s]


Epoch 29, Loss: 0.0003884025572236472


Epoch 30/50: 100%|██████████| 375/375 [01:02<00:00,  6.00it/s]


Epoch 30, Loss: 0.0005895999334121977


Epoch 31/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 31, Loss: 0.00035376807408950603


Epoch 32/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 32, Loss: 0.0005904601460885412


Epoch 33/50: 100%|██████████| 375/375 [01:02<00:00,  5.99it/s]


Epoch 33, Loss: 0.0004958862606533027


Epoch 34/50: 100%|██████████| 375/375 [01:02<00:00,  6.03it/s]


Epoch 34, Loss: 0.00026629159781926625


Epoch 35/50: 100%|██████████| 375/375 [01:02<00:00,  5.98it/s]


Epoch 35, Loss: 0.0004608080693190762


Epoch 36/50: 100%|██████████| 375/375 [01:02<00:00,  6.03it/s]


Epoch 36, Loss: 0.00029367714758457927


Epoch 37/50: 100%|██████████| 375/375 [01:02<00:00,  6.03it/s]


Epoch 37, Loss: 0.000350788979868715


Epoch 38/50: 100%|██████████| 375/375 [01:02<00:00,  5.96it/s]


Epoch 38, Loss: 0.0003388196797168348


Epoch 39/50: 100%|██████████| 375/375 [01:02<00:00,  6.00it/s]


Epoch 39, Loss: 0.0004056218718906166


Epoch 40/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 40, Loss: 0.0004287791284587001


Epoch 41/50: 100%|██████████| 375/375 [01:02<00:00,  6.01it/s]


Epoch 41, Loss: 0.0004947327523307952


Epoch 42/50: 100%|██████████| 375/375 [01:02<00:00,  6.04it/s]


Epoch 42, Loss: 0.00025101499841063437


Epoch 43/50: 100%|██████████| 375/375 [01:01<00:00,  6.11it/s]


Epoch 43, Loss: 0.0006269581129211777


Epoch 44/50: 100%|██████████| 375/375 [01:01<00:00,  6.06it/s]


Epoch 44, Loss: 0.00023220389756412865


Epoch 45/50: 100%|██████████| 375/375 [01:02<00:00,  5.99it/s]


Epoch 45, Loss: 0.0003180129688359254


Epoch 46/50: 100%|██████████| 375/375 [01:02<00:00,  5.96it/s]


Epoch 46, Loss: 0.0002788419167336542


Epoch 47/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 47, Loss: 0.00021549875031632836


Epoch 48/50: 100%|██████████| 375/375 [01:02<00:00,  6.02it/s]


Epoch 48, Loss: 0.0002910550093608132


Epoch 49/50: 100%|██████████| 375/375 [01:02<00:00,  6.00it/s]


Epoch 49, Loss: 0.000296667076011848


Epoch 50/50: 100%|██████████| 375/375 [01:01<00:00,  6.06it/s]

Epoch 50, Loss: 0.00030494451534953743





In [10]:
# test
def generate_captions(model, data_loader, device):
    model.eval()  # 평가 모드로 전환
    all_generated_captions = []
    all_images = []

    with torch.no_grad():
        for images, _ in tqdm(data_loader, desc="Generating Captions"):
            images = [img.to(device) for img in images]  # 이미지를 CUDA로 이동

            # 이미지 임베딩 생성
            stack_img = torch.stack(images).to(device)  
            image_features = model.clip_model.encode_image(stack_img).float()  # 이미지 인코딩
            
            # 이미지 임베딩을 T5 모델 입력 차원에 맞게 투영
            projected_features = model.clip_project(image_features)

            # 텍스트 캡션 생성
            # T5 모델의 디코더를 통해 캡션 생성
            max_length = 50  # 최대 캡션 길이
            generated_ids = model.t5_model.generate(
                inputs_embeds=projected_features.unsqueeze(1),
                max_length=max_length,
                num_beams=5,
                early_stopping=True
            )

            # 생성된 ID를 텍스트로 변환
            generated_captions = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            all_generated_captions.extend(generated_captions)
            all_images.extend(images)

    return all_generated_captions, all_images

In [11]:
# 테스트 데이터셋 준비
real_test = txt_nlp_aug[1001:1005]
test_dataset = CustomDataset(real_test, transform)
test_data_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# 캡션 생성
generated_captions, images = generate_captions(model, test_data_loader, device)

# 결과 출력
for caption in generated_captions:
    print(f'생성된 캡션: {caption}')

Generating Captions: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]

생성된 캡션: .
생성된 캡션: 
생성된 캡션: dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir dir
생성된 캡션: ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski ski



