# Import libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

import torchvision.models as models
import torchvision.transforms as transforms

import timm

from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from wordcloud import WordCloud

import os
import time
import warnings
import random
import string
from collections import Counter
from PIL import Image


sns.set_context("paper")
sns.set_style("whitegrid")

warnings.filterwarnings('ignore')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Data preparing

In [3]:
ROOT = '/kaggle/input/food-vqa/VN20-500_v3'
# ROOT = '/kaggle/input/hand-image-vqa'


train_df = pd.read_csv(f'{ROOT}/annotations/train.csv')
val_df = pd.read_csv(f'{ROOT}/annotations/validation.csv')
test_df = pd.read_csv(f'{ROOT}/annotations/test.csv')


data =  pd.concat([train_df, val_df, test_df], ignore_index=True)
data.head()

Unnamed: 0,question,answer,question_type,answer_type,image_path
0,m√≥n n√†y l√† m√≥n g√¨,b√°nh b√®o,recognition,text,train/Banh_Beo/1.jpg
1,ƒë√¢y c√≥ ph·∫£i l√† b√°nh b√®o kh√¥ng,c√≥,yes/no,boolean,train/Banh_Beo/1.jpg
2,m√†u ch·ªß ƒë·∫°o c·ªßa b√°nh b√®o l√† g√¨,tr·∫Øng,color,text,train/Banh_Beo/1.jpg
3,b√°nh b√®o c√≥ m√†u g√¨,tr·∫Øng,color,text,train/Banh_Beo/1.jpg
4,b√°nh b√®o c√≥ n∆∞·ªõc ch·∫•m kh√¥ng,c√≥,yes/no,boolean,train/Banh_Beo/1.jpg


In [4]:
def clean_text(text):
    text = text.replace('?', '').replace(',', '').lower().split(' ')
    return '_'.join(text)
    
classes = {
    'b√°nh b√®o': 'b√°nh_b√®o',
    'b√°nh cƒÉn': 'b√°nh_cƒÉn',
    'b√°nh gi√≤': 'b√°nh_gi√≤',
    'b√°nh m√¨': 'b√°nh_m√¨',
    'b√°nh tr√°ng n∆∞·ªõng': 'b√°nh_tr√°ng_n∆∞·ªõng',
    'b√°nh x√®o': 'b√°nh_x√®o',
    'b·∫Øp x√†o': 'b·∫Øp_x√†o',
    'b√∫n b√≤': 'b√∫n_b√≤',
    'b√∫n ch·∫£': 'b√∫n_ch·∫£',
    'b√∫n ƒë·∫≠u': 'b√∫n_ƒë·∫≠u',
    'b√∫n m·∫Øm': 'b√∫n_m·∫Øm',
    'b√∫n th·ªãt n∆∞·ªõng': 'b√∫n_th·ªãt_n∆∞·ªõng',
    'cao l·∫ßu': 'cao_l·∫ßu',
    'ch√°o l√≤ng': 'ch√°o_l√≤ng',
    'c∆°m t·∫•m': 'c∆°m_t·∫•m',
    'g·ªèi cu·ªën': 'g·ªèi_cu·ªën',
    'h·ªß ti·∫øu': 'h·ªß_ti·∫øu',
    'm√¨ qu·∫£ng': 'm√¨_qu·∫£ng',
    'ph√° l·∫•u': 'ph√°_l·∫•u',
    'ph·ªü': 'ph·ªü',
    'ch·ªß ƒë·∫°o': 'ch·ªß_ƒë·∫°o',
    'n∆∞·ªõc ch·∫•m': 'n∆∞·ªõc_ch·∫•m',
    'm√†u s·∫Øc': 'm√†u_s·∫Øc',
    
    }
print(f"Train samples: {len(train_df)}, Val samples: {len(val_df)}, Test samples: {len(test_df)}")

Train samples: 41613, Val samples: 5518, Test samples: 10530


In [5]:
data['answer'] = data.answer.apply(clean_text)
train_df['answer'] = train_df.answer.apply(clean_text)
test_df['answer'] = test_df.answer.apply(clean_text)
val_df['answer'] = val_df.answer.apply(clean_text)

In [6]:
train_df.head()

Unnamed: 0,question,answer,question_type,answer_type,image_path
0,m√≥n n√†y l√† m√≥n g√¨,b√°nh_b√®o,recognition,text,train/Banh_Beo/1.jpg
1,ƒë√¢y c√≥ ph·∫£i l√† b√°nh b√®o kh√¥ng,c√≥,yes/no,boolean,train/Banh_Beo/1.jpg
2,m√†u ch·ªß ƒë·∫°o c·ªßa b√°nh b√®o l√† g√¨,tr·∫Øng,color,text,train/Banh_Beo/1.jpg
3,b√°nh b√®o c√≥ m√†u g√¨,tr·∫Øng,color,text,train/Banh_Beo/1.jpg
4,b√°nh b√®o c√≥ n∆∞·ªõc ch·∫•m kh√¥ng,c√≥,yes/no,boolean,train/Banh_Beo/1.jpg


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57661 entries, 0 to 57660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   question       57661 non-null  object
 1   answer         57661 non-null  object
 2   question_type  57661 non-null  object
 3   answer_type    57661 non-null  object
 4   image_path     57661 non-null  object
dtypes: object(5)
memory usage: 2.2+ MB


In [8]:
data.describe()

Unnamed: 0,question,answer,question_type,answer_type,image_path
count,57661,57661,57661,57661,57661
unique,3135,27,3,2,9720
top,m√†u s·∫Øc ch√≠nh c·ªßa m√≥n ƒÉn l√† g√¨,c√≥,yes/no,text,train/Banh_Beo/10.jpg
freq,2130,21860,23837,32797,14


In [9]:
questions = data.question
answers = data.answer


len(questions), len(answers)

(57661, 57661)

In [10]:
class Vocab():
    def __init__(self, texts, pad_token='<pad>', unk_token='<unk>', pad_idx=0, unk_idx=1):
        self.PAD_TOKEN = pad_token
        self.UNK_TOKEN = unk_token
        self.PAD_IDX = pad_idx
        self.UNK_IDX = unk_idx
        
        self.vocab = self._build_vocab(texts)
        self.vocab2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.size = len(self.vocab)

    def word2idx(self, vocab):
        if vocab in self.vocab2idx:
            return self.vocab2idx[vocab]
        else:
            return self.vocab2idx[self.UNK_TOKEN]

    def idx2word(self, idx):
        return self.vocab[idx]

    def _build_vocab(self, texts, min_freq=5):
        counter = Counter()
        for text in texts:
            tokens = word_tokenize(text.lower())
            counter.update(tokens)
    
        vocab = [self.PAD_TOKEN, self.UNK_TOKEN]  
        for word, freq in counter.items():
            if freq >= min_freq: 
                vocab.append(word)
                
        return vocab

In [11]:
class VQADataset(Dataset):
    def __init__(self, data, q_vocab, a_vocab, transform=None, max_len=30, image_folder=f'{ROOT}/images'):
        self.data = data
        self.question_vocab = q_vocab
        self.answer_vocab = a_vocab
        # self.question_vocab = Vocab(self.data.question)
        # self.answer_vocab = Vocab(self.data.answer)
        self.transform = transform 
        
        self.image_folder = image_folder
        self.MAX_QUESTION_LEN = data.question.apply(lambda x: len(x.split(' '))).max()
        self.MAX_ANSWER_LEN = data.answer.apply(lambda x: len(x.split(' '))).max()
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        
        question = row['question']
        answer = row['answer']
        image_id = row['image_path']

        question_tokens = word_tokenize(question)
        question_indices = [self.question_vocab.word2idx(token) for token in question_tokens]
        question_indices = question_indices[:self.MAX_QUESTION_LEN]  # C·∫Øt n·∫øu qu√° d√†i
        question_indices += [self.question_vocab.PAD_IDX] * (self.MAX_QUESTION_LEN - len(question_indices))  # Padding n·∫øu qu√° ng·∫Øn

        # answer_token = word_tokenize(answer)  
        answer_idx = self.answer_vocab.word2idx(answer)

        image_path = os.path.join(self.image_folder, image_id)
        image = Image.open(image_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        return torch.tensor(question_indices, dtype=torch.long), image, torch.tensor(answer_idx, dtype=torch.long)

In [12]:
question_vocab = Vocab(questions)
answer_vocab = Vocab(answers)

answer_vocab.vocab
question_vocab.vocab
len(question_vocab.vocab), len(answer_vocab.vocab)

(127, 29)

In [13]:
def show_images_with_questions(data, num_samples=5, root=f'{ROOT}/images'):
    unique_images = data["image_path"].unique()[:num_samples]  # L·∫•y N ·∫£nh ƒë·∫ßu ti√™n
    
    for img_path in unique_images:
        # L·∫•y danh s√°ch c√¢u h·ªèi v√† c√¢u tr·∫£ l·ªùi li√™n quan ƒë·∫øn ·∫£nh
        related_questions = data[data["image_path"] == img_path][["question", "answer"]]
        
        # Load v√† hi·ªÉn th·ªã ·∫£nh
        img = Image.open(f'{root}/{img_path}')
        plt.figure(figsize=(5, 5))
        plt.imshow(img)
        plt.axis("off")

        # In c√°c c√¢u h·ªèi v√† c√¢u tr·∫£ l·ªùi
        print(f"üì∑ **H√¨nh ·∫£nh:** {img_path}")
        for _, row in related_questions.iterrows():
            print(f"‚ùì {row['question']}")
            print(f"‚úÖ {row['answer']}")
            print("-" * 50)

        plt.show()

# Hi·ªÉn th·ªã ·∫£nh v·ªõi c√¢u h·ªèi
# show_images_with_questions(train_df, num_samples=3)

In [14]:
def collate_fn(batch, PAD_IDX=0):
    questions, images, answers = zip(*batch) 
    
    questions = [torch.tensor(q) for q in questions]
    padded_questions = pad_sequence(questions, batch_first=True, padding_value=PAD_IDX)

    answer = [torch.tensor(a) for a in answers]
    # answers = [torch.tensor(a) for a in answers]
    # padded_answers = pad_sequence(answers, batch_first=True, padding_value=PAD_IDX)
    
    images = torch.stack(images)  
    answer = torch.tensor(answer) 

    return padded_questions, images, answer

In [15]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [16]:
train_dataset = VQADataset(train_df, question_vocab, answer_vocab, transform=transform)
val_dataset = VQADataset(val_df, question_vocab, answer_vocab, transform=transform)
test_dataset = VQADataset(test_df, question_vocab, answer_vocab, transform=transform)

len(train_dataset)

41613

In [17]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn, drop_last=True, num_workers=10)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, drop_last=True, num_workers=10)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn, drop_last=True, num_workers=10)

In [18]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_val_loss = None
        self.early_stop = False
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        # Check if validation loss is nan
        if np.isnan(val_loss):
            self.trace_func("Validation loss is NaN. Ignoring this epoch.")
            return

        if self.best_val_loss is None:
            self.best_val_loss = val_loss
            self.save_checkpoint(val_loss, model)
        elif val_loss < self.best_val_loss - self.delta:
            # Significant improvement detected
            self.best_val_loss = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0  # Reset counter since improvement occurred
        else:
            # No significant improvement
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [19]:
def update_history(history, train_loss, val_loss, train_acc, val_acc):
    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["train_acc"].append(train_acc)
    history["val_acc"].append(val_acc)

def log_training(epoch, epochs, train_loss, train_acc, val_loss, val_acc, early_stopping, start_time):
    end_time = time.time()
    print(f"{'-' * 50}")
    print(f"Epoch: {epoch + 1}/{epochs}:")
    print(f"\tTrain Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"\tVal Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
    print(f"\tEarly Stopping Counter: {early_stopping.counter}, Time: {end_time - start_time:.2f}s")

In [20]:
def validate(model, val_loader, criterion, device):
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for questions, images, answers in val_loader:
            questions, images, answers = questions.to(device), images.to(device), answers.to(device)

            output = model(questions, images)  # [batch, num_answer]

            loss = criterion(output, answers)
            val_loss += loss.item()

            # T√≠nh accuracy
            predicted = torch.argmax(output, dim=-1)  # [batch]
            correct_val += (predicted == answers).sum().item()
            total_val += answers.size(0)

    val_loss /= len(val_loader)
    val_acc = correct_val / total_val

    return val_loss, val_acc

In [21]:
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs, device, early_stopping):
    history = {
        "train_loss": [],
        "val_loss": [],
        "train_acc": [],
        "val_acc": []
    }

    print('Start training...')
    for epoch in range(epochs):
        start_time = time.time()
        
        model.train()
        train_loss = 0.0
        correct_train = 0
        total_train = 0

        for questions, images, answers in train_loader:
            questions, images, answers = questions.to(device), images.to(device), answers.to(device)

            optimizer.zero_grad()
            output = model(questions, images)  # [batch_size, num_answer]
            

            loss = criterion(output, answers)  # [batch, num_answer] v√† [batch]
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()

            # T√≠nh accuracy
            predicted = torch.argmax(output, dim=-1)  # [batch]
            correct_train += (predicted == answers).sum().item()
            total_train += answers.size(0)

        train_loss /= len(train_loader)
        train_acc = correct_train / total_train

        # Validation
        val_loss, val_acc = validate(model, val_loader, criterion, device)

        update_history(history, train_loss, val_loss, train_acc, val_acc)
        # scheduler.step(val_loss)

        early_stopping(val_loss, model)
        if early_stopping.early_stop:
            print("Early stopping triggered")
            break
    
        log_training(epoch, epochs, train_loss, train_acc, val_loss, val_acc, early_stopping, start_time)

    return history


In [22]:
def test(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    predictions = []

    with torch.no_grad():
        for questions, images, answers in test_loader:
            questions, images, answers = questions.to(device), images.to(device), answers.to(device)

            output = model(questions, images)  # [batch, num_answer]

            # T√≠nh loss
            loss = criterion(output, answers)
            test_loss += loss.item()

            # L·∫•y nh√£n d·ª± ƒëo√°n
            predicted = torch.argmax(output, dim=-1)  # [batch]
            predictions.append(predicted.cpu().numpy())  # Chuy·ªÉn sang numpy ƒë·ªÉ ph√¢n t√≠ch

            # T√≠nh s·ªë l∆∞·ª£ng ƒë√∫ng
            correct += (predicted == answers).sum().item()
            total += answers.size(0)

    test_loss /= len(test_loader)
    test_acc = correct / total

    print(f"Test Loss: {test_loss:.4f} | Test Accuracy: {test_acc:.4f}")

    return test_loss, test_acc, predictions


# Modeling

In [23]:
class ImageEncoder(nn.Module):
    def __init__(self, pretrained=True):
        super(ImageEncoder, self).__init__()
        mobilenet = models.mobilenet_v2(pretrained=pretrained)
        self.feature_extractor = mobilenet.features
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(1280, 256)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        features = self.feature_extractor(x)
        features = self.pool(features).view(features.size(0), -1)
        features = self.fc(features)
        return self.dropout(features)

class TextEncoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=256):
        super(TextEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 256)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        hidden = lstm_out[:, -1, :]
        hidden = self.fc(hidden)
        return self.dropout(hidden)

class VQA(nn.Module):
    def __init__(self, vocab_size, num_classes):
        super(VQA, self).__init__()

        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder(vocab_size)

        self.fc_fusion = nn.Linear(256, 256)
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, question, image):
        img_features = self.image_encoder(image)
        text_features = self.text_encoder(question)

        fused = self.fc_fusion(img_features * text_features)

        return self.classifier(fused)


## Hyperparameters

In [24]:
model = VQA(
    vocab_size=question_vocab.size,
    num_classes=answer_vocab.size
)

model.to(device)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13.6M/13.6M [00:00<00:00, 143MB/s]


Using 2 GPUs!


In [25]:
epochs = 100
criterion = nn.CrossEntropyLoss(ignore_index=question_vocab.PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5)

# early_stopping = EarlyStopping() 
early_stopping = EarlyStopping(patience=8, verbose=True)


# Training and Evaluation

## Training

In [None]:
history = train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs, device, early_stopping)

Start training...
Validation loss decreased (inf --> 0.745821).  Saving model ...
--------------------------------------------------
Epoch: 1/100:
	Train Loss: 0.9610, Train Acc: 0.6765
	Val Loss: 0.7221, Val Acc: 0.7458
	Early Stopping Counter: 0, Time: 193.49s
EarlyStopping counter: 1 out of 12
--------------------------------------------------
Epoch: 2/100:
	Train Loss: 0.6454, Train Acc: 0.7759
	Val Loss: 0.6192, Val Acc: 0.7793
	Early Stopping Counter: 1, Time: 188.73s


## Testing

In [None]:
model.load_state_dict(torch.load(early_stopping.path))
test_loss, test_acc, predictions = test(model, test_loader, criterion, device)

In [None]:
def plot_training_history(history, title="Training History"):
    epochs = range(1, len(history["train_loss"]) + 1)
    
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(title)
    
    # V·∫Ω Train Loss v√† Eval Loss
    axes[0].plot(epochs, history["train_loss"], label="Train Loss")
    axes[0].plot(epochs, history["val_loss"], label="Val Loss")
    axes[0].set_xlabel("Epochs")
    axes[0].set_ylabel("Loss")
    axes[0].set_title("Training and Validation Loss")
    axes[0].legend()
    axes[0].grid(True)
    
    # V·∫Ω Train Accuracy v√† Eval Accuracy
    axes[1].plot(epochs, history["train_acc"], label="Train Accuracy")
    axes[1].plot(epochs, history["val_acc"], label="Val Accuracy")
    axes[1].set_xlabel("Epochs")
    axes[1].set_ylabel("Accuracy")
    axes[1].set_title("Training and Validation Accuracy")
    axes[1].legend()
    axes[1].grid(True)
    
    plt.tight_layout()
    plt.show()

In [None]:
plot_training_history(history, title='Vqa early fusion')

In [None]:
for i in [random.randint(0, len(test_dataset) - 1) for _ in range(20)]:
    model.eval()
    if isinstance(model, torch.nn.DataParallel):
        model.module.eval()
    question, image, answer = test_dataset[i]
    
    question, image = question.to(device), image.to(device)

    with torch.no_grad():
        output = model(question.unsqueeze(0), image.unsqueeze(0))  
        predicted_idx = torch.argmax(output, dim=1).item()  

    question_text = test_dataset.data.iloc[i]['question']
    answer_text = test_dataset.data.iloc[i]['answer']

    predicted_answer = test_dataset.answer_vocab.idx2word(predicted_idx)

    image_np = image.cpu().permute(1, 2, 0).numpy()  #
    image_np = (image_np - image_np.min()) / (image_np.max() - image_np.min())

    plt.imshow(image_np)
    plt.axis("off")
    plt.show()

    print(f"Question: {question_text}")
    print(f"GT Answer: {answer_text}")
    print(f"Predicted Answer: {predicted_answer}")
    print("-" * 50)
