# Load Dataset

In [2]:
import os

from spacy.symbols import nn
from sympy.printing.pytorch import torch

In [65]:
def load_data(data_path):
    """Parse a VQA annotation file into a list of sample dicts.

    Every non-empty line is expected to look like::

        COCO_val2014_000000393225.jpg#0<TAB>Is this a creamy soup ? no

    that is ``<image file>#<index>``, a tab, then the question (terminated by
    its last ``?``) followed by the answer.

    Args:
        data_path: Path of the annotation text file.

    Returns:
        A list of dicts with the keys ``'question'``, ``'image_path'`` and
        ``'answer'``, one per non-empty line, in file order.

    Raises:
        ValueError: If a non-empty line has no tab separator or its text
            field contains no ``?``.
    """
    data = []

    with open(data_path, "r", encoding="utf-8") as f:
        # Iterate lazily instead of materialising every line with readlines().
        for line_no, line in enumerate(f, start=1):
            # Blank lines (typically a trailing newline at EOF) carry no sample.
            if not line.strip():
                continue

            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                raise ValueError(
                    f"{data_path}:{line_no}: expected '<image>#<n><TAB><question> ? <answer>'"
                )

            # Split on the LAST '?' so a question mark inside the question is kept.
            question_part, qmark, answer_part = fields[1].rpartition("?")
            if not qmark:
                raise ValueError(f"{data_path}:{line_no}: no '?' found in question field")

            data.append({
                'question': question_part + '?',
                # Drop the '#<index>' suffix whatever its number of digits.
                'image_path': fields[0].rsplit("#", 1)[0],
                'answer': answer_part.strip(),
            })

    return data

In [66]:
# Annotation files of the three splits all live in ../data_coco.
annotation_dir = os.path.join("..", "data_coco")
train_path = os.path.join(annotation_dir, "vaq2.0.TrainImages.txt")
val_path = os.path.join(annotation_dir, "vaq2.0.DevImages.txt")
test_path = os.path.join(annotation_dir, "vaq2.0.TestImages.txt")

# Parse the splits in train / val / test order.
train_data, val_data, test_data = (
    load_data(split_path) for split_path in (train_path, val_path, test_path)
)


In [55]:
# Sanity check: show the first parsed training sample.
print(train_data[0])

{'question': 'Is this a creamy soup ?', 'image_path': 'COCO_val2014_000000393225.jpg', 'answer': 'no'}


# Data Processing

In [67]:
import spacy
from torchtext.vocab import build_vocab_from_iterator


In [68]:
# Load spaCy's "en_core_web_sm" pipeline; the cells below call its tokenizer.
nlp = spacy.load("en_core_web_sm")


def get_token(data_iters):
    """Yield, for each sample, the list of token strings of its question.

    Args:
        data_iters: Iterable of sample dicts that have a ``'question'`` key.

    Yields:
        A list with the ``.text`` of every token produced by ``nlp.tokenizer``.
    """
    for entry in data_iters:
        doc = nlp.tokenizer(entry['question'])
        yield [tok.text for tok in doc]


# Build the question vocabulary from the training split only; tokens seen
# fewer than twice are left out of the vocabulary.
vocab = build_vocab_from_iterator(
    get_token(train_data),
    min_freq=2,
    # All specials use the bracketed form so that they cannot collide with the
    # ordinary words "sos" / "eos" appearing in a question.
    specials=["<unk>", "<sos>", "<eos>", "<pad>"],
    special_first=True
)
# A token that is not in the vocabulary is mapped to the ID of <unk>.
vocab.set_default_index(vocab["<unk>"])

In [69]:
# A fixed length is used so that questions can be stacked into batches.
def tokenize(question_text, max_seq_len=20):
    """Turn a question string into a fixed-length list of vocabulary IDs.

    The text is split with ``nlp.tokenizer`` and each token's ``.text`` is
    looked up in ``vocab``. Shorter sequences are right-padded with the
    ``'<pad>'`` ID; longer ones are cut to their first ``max_seq_len`` IDs.

    Args:
        question_text: The raw question string.
        max_seq_len: Target sequence length (default 20).

    Returns:
        A list of integer token IDs.
    """
    token_ids = [vocab[tok.text] for tok in nlp.tokenizer(question_text)]
    shortfall = max_seq_len - len(token_ids)
    if shortfall > 0:
        token_ids.extend([vocab['<pad>']] * shortfall)
    elif shortfall < 0:
        token_ids = token_ids[:max_seq_len]
    return token_ids

In [70]:
# Smoke test: tokenize a sample sentence and report the vocabulary size.
example = "Hello World!"
print(tokenize(example))
print(len(vocab))

[0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
1678


In [71]:
# Answer labels are collected from the training split.
classes = set([sample['answer'] for sample in train_data])
# Number the labels in sorted order. Iterating the set directly would give an
# order that can change between interpreter sessions (string hash
# randomization), silently changing the label <-> index mapping.
sorted_classes = sorted(classes)
classes_to_idx = {
    cls_name: idx for idx, cls_name in enumerate(sorted_classes)
}
idx_to_classes = {
    idx: cls_name for idx, cls_name in enumerate(sorted_classes)
}
print(idx_to_classes)

{0: 'no', 1: 'yes'}


In [72]:
# Show the distinct answer labels found in the training split.
print(classes)

{'no', 'yes'}


# PyTorch Dataset

In [73]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as transforms

In [237]:
class VQADataset(Dataset):
    """Dataset yielding ``(image, question, label)`` triples for VQA.

    Args:
        data: Sequence of sample dicts with the keys ``'image_path'``,
            ``'question'`` and ``'answer'``.
        classes_to_idx: Mapping from answer string to class index.
        max_seq_len: Length passed to ``tokenize`` for every question
            (default 20).
        transform: Optional callable applied to the loaded RGB image.
        root_dir: Directory that the samples' ``'image_path'`` is joined to.
    """

    def __init__(
        self,
        data,
        classes_to_idx,
        max_seq_len=20,
        transform=None,
        root_dir="../data_coco/val2014-resised/"
    ):
        self.data = data
        self.classes_to_idx = classes_to_idx
        self.max_seq_len = max_seq_len
        self.transform = transform
        self.root_dir = root_dir

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, question, label)``.

        ``question`` and ``label`` are ``torch.long`` tensors; ``image`` is
        the RGB image, passed through ``transform`` when one was given.
        """
        sample = self.data[idx]

        image_file = os.path.join(self.root_dir, sample['image_path'])
        image = Image.open(image_file).convert("RGB")
        if self.transform:
            image = self.transform(image)

        token_ids = tokenize(sample['question'], self.max_seq_len)
        question = torch.tensor(token_ids, dtype=torch.long)

        class_idx = self.classes_to_idx[sample['answer']]
        label = torch.tensor(class_idx, dtype=torch.long)

        return image, question, label


# DataLoader

In [216]:
# Image preprocessing: resize to 224x224, convert to a tensor, then normalize
# each channel (the mean/std values look like the usual ImageNet statistics).
resize_step = transforms.Resize((224, 224))
normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
transform = transforms.Compose([resize_step, transforms.ToTensor(), normalize_step])


In [238]:
# One dataset per split; all share the same label mapping and preprocessing.
train_dataset, val_dataset, test_dataset = (
    VQADataset(split_samples, classes_to_idx, transform=transform)
    for split_samples in (train_data, val_data, test_data)
)

In [239]:
train_batch_size = 128
test_batch_size = 32

# Only the training split is shuffled; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_split, batch_size=test_batch_size, shuffle=False)
    for eval_split in (val_dataset, test_dataset)
)

# Model

In [219]:
import torch
import torch.nn as nn
import timm


In [257]:
class VQAModel(nn.Module):
    """Two-branch VQA classifier: ResNet-50 image encoder + BiLSTM question encoder.

    Both branches are projected to 1024 features, fused by element-wise
    multiplication and classified by a two-layer MLP.

    Args:
        n_classes: Number of answer classes (size of the output logits).
        embedding_dim: Size of the token embeddings.
        n_layer: Number of stacked LSTM layers.
        hidden_size: LSTM hidden size per direction; also the number of
            features the image backbone is configured to output.
        dropout: Dropout probability applied before the final linear layer.

    Note:
        The embedding table size and the padding ID are read from the
        module-level ``vocab`` when the model is constructed.
    """

    def __init__(
        self,
        n_classes,
        embedding_dim=64,
        n_layer=2,
        hidden_size=256,
        dropout=0.2,
    ):
        super().__init__()
        # num_classes=hidden_size makes the backbone emit hidden_size features.
        self.image_encoder = timm.create_model(
            'resnet50',
            pretrained=True,
            num_classes=hidden_size,
        )
        self.image_fc = nn.Linear(hidden_size, 1024)

        # ID of the padding token, used in forward() to recover true lengths.
        self.pad_idx = vocab['<pad>']
        self.embedding = nn.Embedding(len(vocab), embedding_dim)
        self.text_encoder = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=n_layer,
            bidirectional=True,
            batch_first=True,
        )
        self.text_fc = nn.Linear(hidden_size * 2, 1024)

        self.fc1 = nn.Linear(1024, 1000)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(1000, n_classes)

    def forward(self, image, text):
        """Compute answer logits for a batch.

        Args:
            image: Batch of images accepted by the ResNet-50 backbone.
            text: ``(batch, seq_len)`` tensor of token IDs, right-padded with
                the ``'<pad>'`` ID.

        Returns:
            ``(batch, n_classes)`` tensor of unnormalised class scores.
        """
        img_features = self.image_encoder(image)
        img_features = self.image_fc(img_features)

        text_emb = self.embedding(text)

        # Taking output[:, -1, :] is wrong for right-padded input: the forward
        # direction has consumed the padding and the backward direction has
        # only seen the last (pad) position. Pack the sequences by their true
        # length and use the final hidden state of each direction instead.
        # Assumes padding only trails the real tokens; an all-pad row is
        # treated as length 1 because packing rejects zero-length sequences.
        lengths = (text != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            text_emb, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.text_encoder(packed)

        # h_n is (num_layers * 2, batch, hidden); its last two entries are the
        # top layer's forward and backward final states.
        text_features = torch.cat((h_n[-2], h_n[-1]), dim=1)  # [batch, hidden*2]
        text_features = self.text_fc(text_features)

        # Multiplicative fusion of the two modalities.
        x = img_features * text_features
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x


In [259]:
# Model hyperparameters.
n_classes = len(classes)
img_model_name = "resnet50"
hidden_size = 256
n_layer = 2
embedding_dim = 64
dropout = 0.2

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the network and move its parameters to the selected device.
model = VQAModel(
    n_classes=n_classes,
    embedding_dim=embedding_dim,
    n_layer=n_layer,
    hidden_size=hidden_size,
    dropout=dropout,
).to(device)


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


# Evaluate

In [253]:
def evaluate(model, dataloader, criterion, device):
    """Compute the mean batch loss and the accuracy of ``model`` on ``dataloader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled while iterating.

    Args:
        model: Callable module taking ``(image, question)`` and returning logits.
        dataloader: Iterable of ``(image, question, labels)`` tensor batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(loss, acc)``: the average of the per-batch losses and the fraction
        of correctly predicted samples.
    """
    model.eval()

    loss_sum = 0
    n_batches = 0
    n_correct = 0
    n_samples = 0

    with torch.no_grad():
        for image, question, labels in dataloader:
            image = image.to(device)
            question = question.to(device)
            labels = labels.to(device)

            logits = model(image, question)
            loss_sum += criterion(logits, labels).item()
            n_batches += 1

            # The predicted class is the index of the largest logit per row.
            predicted = torch.max(logits.data, 1).indices
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    return loss_sum / n_batches, n_correct / n_samples

# Training

In [254]:
def fit(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer,
    scheduler,
    device,
    epochs
):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Per epoch: one pass over ``train_loader`` with an optimizer step per
    batch, one call to ``evaluate`` on ``val_loader``, a printed summary
    line, and one ``scheduler.step()``.

    Args:
        model: Module called as ``model(images, questions)``.
        train_loader: Iterable of ``(images, questions, labels)`` batches.
        val_loader: Validation batches, forwarded to ``evaluate``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model parameters.
        scheduler: Learning-rate scheduler stepped once per epoch.
        device: Device the batch tensors are moved to.
        epochs: Number of epochs to run.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs)``, four lists with
        one entry per epoch.
    """
    train_losses, val_losses, train_accs, val_accs = [], [], [], []

    for epoch in range(epochs):
        model.train()

        epoch_loss_sum = 0
        n_batches = 0
        n_correct = 0
        n_seen = 0

        for images, questions, labels in train_loader:
            images = images.to(device)
            questions = questions.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(images, questions)
            loss = criterion(outputs, labels)

            # Track running training accuracy from the current predictions.
            predicted = torch.max(outputs.data, 1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()

            loss.backward()
            optimizer.step()

            epoch_loss_sum += loss.item()
            n_batches += 1

        train_loss = epoch_loss_sum / n_batches
        train_acc = n_correct / n_seen
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f'EPOCH {epoch + 1}:\tTrain loss: {train_loss:.4f}\tTrain acc: {train_acc:.4f}\tVal loss: {val_loss:.4f}\tVal acc: {val_acc:.4f}')
        scheduler.step()

    return train_losses, val_losses, train_accs, val_accs

In [255]:
lr = 1e-2
epochs = 50

# Decay the learning rate once, after 60% of the epochs. StepLR documents
# step_size as an int, whereas epochs * 0.6 is a float (and a fractional one
# for many epoch counts), so round it and keep it at least 1.
scheduler_step_size = max(1, round(epochs * 0.6))
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(
    model.parameters(),
    lr=lr
)
# Multiply the learning rate by 0.1 every scheduler_step_size epochs.
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=scheduler_step_size,
    gamma=0.1
)

In [256]:
# Run the training loop and keep the per-epoch loss / accuracy curves.
train_losses, val_losses, train_accs, val_accs = fit(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    epochs=epochs,
)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
