# Load Dataset

In [2]:
import os

In [1]:
def load_data(data_path):
    data = []

    with open(data_path, "r") as f:
        lines = f.readlines()

        for line in lines:
            part = line.strip().split("\t")
            image_id = part[0]
            image_id = image_id.split("#")
            image_id = image_id[0]
            remaining_part = part[1]
            # Tách  (remaining_part) thành câu hỏi và câu trả lời
            QA = remaining_part.strip().split("?", 1)
            question = QA[0]
            answer = QA[1]
            answer = answer.strip()
            data_sample = {
                'question': QA[0] + '?',
                'image_path': image_id,
                'answer': answer
            }
            data.append(data_sample)

    return data

In [4]:
train_path = os.path.join("..", "data_coco", "vaq2.0.TrainImages.txt")
val_path = os.path.join("..", "data_coco", "vaq2.0.DevImages.txt")
test_path = os.path.join("..", "data_coco", "vaq2.0.TestImages.txt")

train_data=load_data(train_path)
val_data=load_data(val_path)
test_data=load_data(test_path)


In [5]:
print(train_data[0])

{'question': 'Is this a creamy soup ?', 'image_path': 'COCO_val2014_000000393225.jpg', 'answer': 'no'}


# Data Process

In [6]:
import spacy
from torchtext.vocab import build_vocab_from_iterator


In [7]:
nlp = spacy.load("en_core_web_sm")


def get_token(data_iters):
    for sample in data_iters:
        question = sample['question']
        yield [token.text for token in nlp.tokenizer(question)]


vocab = build_vocab_from_iterator(
    get_token(train_data),
    min_freq=1,
    specials=["<unk>", "sos", "eos", "<pad>"],
    special_first=True
)
vocab.set_default_index(vocab["<unk>"])#Nếu một token.text không có trong vocab, nó sẽ được gán ID của <unk>

In [8]:
#Độ dài cố định để xử lý theo batch
def tokenize(question_text, max_seq_len= 20):
    spacy_tokens = nlp.tokenizer(question_text)
    #  Chuyển các token thành ID số sử dụng vocab
    # Chúng ta cần lấy .text của mỗi  Token để tra cứu trong vocab
    numerical_sequence = [vocab[token.text] for token in spacy_tokens]
    current_len = len(numerical_sequence)
    if current_len < max_seq_len:
        padding_needed = max_seq_len - current_len
        numerical_sequence += [vocab['<pad>']] * padding_needed
    elif current_len > max_seq_len:
        numerical_sequence = numerical_sequence[:max_seq_len]
    return numerical_sequence

In [9]:
example = "Hello World!"
print(tokenize(example))

[0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [10]:
classes = set([sample['answer'] for sample in train_data])
classes_to_idx = {
    cls_name: idx for idx, cls_name in enumerate(classes)
}
idx_to_classes = {
    idx: cls_name for idx, cls_name in enumerate(classes)
}
print(idx_to_classes)

{0: 'yes', 1: '? yes', 2: 'no', 3: '? no', 4: '" ? no'}


# Pytorch Dataset

In [18]:
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

In [12]:
class VQADataset(Dataset):
    def __init__(
        self,
        data,
        classes_to_idx,
        max_seq_len=20,       # Độ dài tối đa của chuỗi câu hỏi (mặc định là 20)
        transform=None,
        root_dir='/data_coco/val2014-resized/'
    ):
        self.transform = transform
        self.data = data
        self.max_seq_len = max_seq_len
        self.root_dir = root_dir
        self.classes_to_idx = classes_to_idx
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.data[idx]['image_path'])
        img = Image.open(img_path).convert("RGB")
        if self.tranform:
            img = self.tranform(img)
        question = self.data[idx]['question']
        question = tokenize(question, self.max_seq_len)
        question = torch.tensor(question, dtype = torch.long)

        answer = self.data[idx]['answer']
        answer = self.classes_to_idx[answer]
        answer = torch.tensor(answer, dtype = torch.long)

# DataLoader

In [24]:
transform = transforms.Compose(
    [
        transforms.Resize((224,224)),
        transforms.ToTensor(),
        transforms.Normalize(
                    (0.485, 0.456, 0.406),
                    (0.229, 0.224, 0.225)
        ),
    ]
)

In [25]:
train_dataset = VQADataset(train_data, classes_to_idx, transform=transform)
val_dataset = VQADataset(val_data, classes_to_idx, transform=transform)
test_dataset = VQADataset(test_data, classes_to_idx, transform=transform)