<a href="https://colab.research.google.com/github/Jellyjellyjinjin/Daycon-multimodal/blob/main/%5BBaseline%5D_Vit_%2B_Visual_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Attach Google Drive under /content/drive; a later cell reads the dataset archive from there.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Work from /content so the relative paths used by later cells (e.g. 'train.csv') resolve there.
%cd /content

# Unpack the archive stored on the mounted Drive into the current directory; -qq silences unzip.
!unzip -qq "/content/drive/MyDrive/multi.zip"

/content


In [None]:
from glob import glob

# Sanity check: collect the JPEG paths of each split and report how many were found.
# glob() already returns a list, so no extra wrapping is required.
train_file = glob('/content/image/train/*.jpg')
test_file = glob('/content/image/test/*.jpg')

print(len(train_file), len(test_file))


107231 11915


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.5 MB/s[0m eta [36m0:00:

## Import

In [None]:
import os
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torchvision.models as models
from torchvision import transforms
from PIL import Image
from transformers import AutoModel,ViTModel,ViTFeatureExtractor

from transformers import BertTokenizer, VisualBertModel

from tqdm.auto import tqdm

## Dataset

In [None]:
class VQADataset(Dataset):
    """Visual-question-answering dataset backed by a DataFrame and a directory of JPEGs.

    Every row of ``df`` must provide ``image_id`` and ``question``; ``answer`` is read as
    well unless ``is_test`` is True.

    Args:
        df: DataFrame holding one sample per row.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with an
            ``input_ids`` entry of shape ``[1, max_length]`` when asked for ``'pt'`` tensors.
        transform: Callable applied to the RGB PIL image.
        img_path: Directory that contains ``<image_id>.jpg`` files.
        is_test: When True, samples carry no ``answer`` entry.
        max_length: Fixed token length questions and answers are padded/truncated to.
    """

    def __init__(self, df, tokenizer, transform, img_path, is_test=False, max_length=32):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.img_path = img_path
        self.is_test = is_test
        self.max_length = max_length

    def __len__(self):
        """Number of samples, i.e. number of rows in ``df``."""
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` to a fixed-length 1-D tensor of token ids."""
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Remove only the leading batch dimension; a bare squeeze() would also collapse the
        # sequence dimension whenever max_length == 1.
        return encoded['input_ids'].squeeze(0)

    def __getitem__(self, idx):
        """Return a dict with ``image`` and ``question`` (plus ``answer`` for training data)."""
        row = self.df.iloc[idx]

        # Image
        img_name = os.path.join(self.img_path, row['image_id'] + '.jpg')
        with Image.open(img_name) as img:
            image = self.transform(img.convert('RGB'))

        # Question; the transformed image is returned as-is for both train and test samples.
        sample = {
            'image': image,
            'question': self._encode(row['question']),
        }

        # Answer (training data only)
        if not self.is_test:
            sample['answer'] = self._encode(row['answer'])

        return sample

## Model

In [None]:
class VQAModel(nn.Module):
    """ViT image encoder + VisualBERT text encoder with a per-token vocabulary classifier.

    The pooled ViT feature of each image is repeated along the question's sequence axis,
    concatenated with VisualBERT's per-token hidden states, and projected to vocabulary
    logits. VisualBERT receives the question tokens only (no ``visual_embeds`` are passed).

    Args:
        vocab_size: Number of output classes per sequence position.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size

        self.ViT = ViTModel.from_pretrained('google/vit-base-patch32-224-in21k')

        # Not used by forward(); kept so code that reads `model.bert_tokenizer` keeps working.
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.visualbert = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')

        # ViT hidden size + VisualBERT hidden size, both read from the loaded configs
        # instead of hard-coding the ViT width.
        combined_features_size = (
            self.ViT.config.hidden_size + self.visualbert.config.hidden_size
        )
        self.classifier = nn.Linear(combined_features_size, vocab_size)

    def forward(self, images, question, attention_mask=None):
        """Compute per-token vocabulary logits.

        Args:
            images: Image batch passed to the ViT encoder.
            question: Question token ids, ``[batch, sequence]``.
            attention_mask: Optional ``[batch, sequence]`` mask forwarded to VisualBERT.
                When omitted, the text encoder is called without a mask, as before.

        Returns:
            Logits of shape ``[batch, sequence, vocab_size]``.
        """
        image_features = self.ViT(images).pooler_output  # [batch, vit_hidden]

        outputs = self.visualbert(input_ids=question, attention_mask=attention_mask)
        output_features = outputs.last_hidden_state  # [batch, sequence, bert_hidden]

        # Repeat the single image vector for every token position: [batch, sequence, vit_hidden]
        image_features = image_features.unsqueeze(1).expand(-1, output_features.size(1), -1)

        # [batch, sequence, vit_hidden + bert_hidden]
        combined = torch.cat([image_features, output_features], dim=-1)
        return self.classifier(combined)  # [batch, sequence, vocab_size]

## DataLoader

In [None]:
# Load the data
train_df = pd.read_csv('train.csv')

test_df = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
train_img_path = 'image/train'
test_img_path = 'image/test'

# dataset & dataloader
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# bert-base-uncased already ships a [PAD] token, so this adds nothing to the vocabulary;
# it is kept as a guard in case the tokenizer is swapped for one without a pad token.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
vocab_size = len(tokenizer)

# The google/vit-*-in21k checkpoints expect inputs normalised with mean 0.5 / std 0.5 per
# channel (see the checkpoint's preprocessor config), not the ImageNet statistics that
# torchvision CNNs use.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

train_dataset = VQADataset(train_df, tokenizer, transform, train_img_path, is_test=False)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Train & Inference

In [None]:
def train(model, loader, optimizer, criterion):
    """Train ``model`` for one pass over ``loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(images, question)`` that returns logits of shape
            ``[batch, sequence, vocab]``.
        loader: Sized iterable of dicts with ``'image'``, ``'question'`` and ``'answer'``
            tensors.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss called with ``[batch * sequence, vocab]`` logits and
            ``[batch * sequence]`` targets.

    Returns:
        The average of the per-batch losses as a float, or NaN when ``loader`` is empty.
    """
    model.train()

    # Use the device the model lives on rather than a notebook-level `device` global, so
    # the function does not depend on a cell that is defined further down.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    total_loss = 0.0
    num_batches = len(loader)

    for data in tqdm(loader, total=num_batches):
        images = data['image'].to(device)
        question = data['question'].to(device)
        answer = data['answer'].to(device)

        optimizer.zero_grad()

        outputs = model(images, question)

        # outputs: [batch, sequence, vocab], answer: [batch, sequence]
        loss = criterion(outputs.view(-1, outputs.size(-1)), answer.view(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # An empty loader has no meaningful average; report NaN instead of dividing by zero.
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches

In [None]:
def inference(model, loader):
    """Predict the most likely token id at every sequence position for each sample.

    Args:
        model: Module called as ``model(images, question)`` that returns logits of shape
            ``[batch, sequence, vocab]``.
        loader: Sized iterable of dicts with ``'image'`` and ``'question'`` tensors.

    Returns:
        A list with one 1-D NumPy array of predicted token ids per sample.
    """
    model.eval()

    # Use the device the model lives on rather than a notebook-level `device` global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    preds = []
    with torch.no_grad():
        for data in tqdm(loader, total=len(loader)):
            images = data['image'].to(device)
            question = data['question'].to(device)

            outputs = model(images, question)  # [batch, sequence, vocab]

            pred = outputs.argmax(dim=-1)  # [batch, sequence] token ids
            # extend() iterates over the first axis, so one row is stored per sample.
            preds.extend(pred.cpu().numpy())

    return preds

## Run!

In [None]:
# Device: use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device is {device}")

# Seed torch so that weight initialisation and batch shuffling repeat across re-runs
SEED = 42
torch.manual_seed(SEED)

# Training hyper-parameters
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5

# Model
model = VQAModel(vocab_size).to(device)

# Criterion and Optimizer
# NOTE(review): padded answer positions are counted in this loss, so the reported value is
# dominated by [PAD] targets; consider ignore_index=tokenizer.pad_token_id after checking
# how that interacts with the post-processing cells.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(NUM_EPOCHS):
    avg_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

current device is cuda


Downloading (…)lve/main/config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/352M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/448M [00:00<?, ?B/s]

  0%|          | 0/11236 [00:00<?, ?it/s]

Epoch: 1, Loss: 0.1671


  0%|          | 0/11236 [00:00<?, ?it/s]

Epoch: 2, Loss: 0.1171


  0%|          | 0/11236 [00:00<?, ?it/s]

Epoch: 3, Loss: 0.1020


## Post-Processing

In [None]:
# Wrap the test split; order is preserved (no shuffling) so predictions line up with test_df rows
test_dataset = VQADataset(
    df=test_df,
    tokenizer=tokenizer,
    transform=transform,
    img_path=test_img_path,
    is_test=True,
)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# Predict token ids for every test sample
preds = inference(model=model, loader=test_loader)



  0%|          | 0/1265 [00:00<?, ?it/s]

In [None]:
# Persist only the parameters (state dict), not the pickled module object
torch.save(obj=model.state_dict(), f='model_weights.pt')

In [None]:
# Drop [PAD] positions, then turn the remaining token ids back into text.
# The pad id is taken from the tokenizer: the previously hard-coded 50257 lies outside the
# bert-base-uncased vocabulary, so the filter never removed anything.
pad_token_id = tokenizer.pad_token_id
no_pad_output = [
    tokenizer.decode(pred[pred != pad_token_id]).strip()
    for pred in preds
]

In [None]:
# Show only a handful of decoded predictions; dumping the full list floods the notebook
no_pad_output[:5]

['[CLS] no [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] beach [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] red [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] brown [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] no [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]',
 '[CLS] blue [SEP] [PAD] [PA

In [None]:
def _extract_answer(decoded):
    """Return the answer text of one decoded prediction.

    A leading ``[CLS]`` is skipped and everything up to (not including) the first ``[SEP]``
    or ``[PAD]`` is kept, so multi-word answers survive and strings that are empty or lack
    special tokens no longer raise ``IndexError``.
    """
    tokens = decoded.split()
    if tokens and tokens[0] == '[CLS]':
        tokens = tokens[1:]

    answer_tokens = []
    for token in tokens:
        if token in ('[SEP]', '[PAD]'):
            break
        answer_tokens.append(token)
    return ' '.join(answer_tokens)


no_pad_outputs = [_extract_answer(output) for output in no_pad_output]

# Preview only; the full list has one entry per test sample
no_pad_outputs[:10]

['no',
 'beach',
 'red',
 'brown',
 'no',
 'blue',
 '2',
 'yes',
 'yes',
 'blue',
 'no',
 'bathroom',
 'no',
 '0',
 'white',
 '1',
 'flowers',
 'yellow',
 'yes',
 '2',
 'yes',
 'yes',
 'yes',
 'no',
 '0',
 'dirt',
 'yes',
 'green',
 'yes',
 'lamp',
 '2',
 'train',
 '0',
 'to',
 '1',
 'no',
 'blue',
 'night',
 'to',
 '2',
 'man',
 'flying',
 'yes',
 '1',
 'tie',
 '1',
 'no',
 'chinese',
 'left',
 'skateboarding',
 'yes',
 'white',
 'yes',
 'yes',
 'people',
 'no',
 'white',
 'yes',
 'breakfast',
 'fork',
 'lab',
 'no',
 'bed',
 'yes',
 'blue',
 'blue',
 'gas',
 'left',
 'no',
 'baseball',
 'no',
 'pink',
 'no',
 'yes',
 'bus',
 'blue',
 'yes',
 'apartment',
 'lunch',
 'left',
 'no',
 'no',
 'pink',
 'kitchen',
 'in',
 'cat',
 '1',
 'cake',
 'toilet',
 'man',
 'green',
 'bed',
 'w',
 'snow',
 'no',
 'birthday',
 'red',
 'yes',
 'yes',
 'a',
 '4',
 'background',
 'no',
 'lettuce',
 'no',
 '2',
 'yes',
 'chocolate',
 'winter',
 'long',
 'yes',
 'jeans',
 'no',
 'yes',
 'no',
 '2',
 '4',
 '

## Submission

In [None]:
# Fill the answer column of the submission template with the extracted answers,
# then write it to disk without the DataFrame index.
sample_submission['answer'] = no_pad_outputs
sample_submission.to_csv(path_or_buf='submission.csv', index=False)