In [1]:
!pip install pyvi timm
import os
import zipfile
import json
import random
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from torchvision import transforms, models
from PIL import Image
from pyvi import ViTokenizer
from transformers import BlipProcessor, BlipForQuestionAnswering
from google.colab import drive
import timm
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import OneCycleLR
import matplotlib.image as mpimg



In [None]:
drive.mount('/content/drive')
project_dir = "/content/drive/MyDrive/Project_Gki"
os.makedirs(project_dir, exist_ok=True)
zip_path = os.path.join(project_dir, "StanfordCars.zip")
extracted_dir = "/content/stanford_cars"

if not os.path.exists(extracted_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extracted_dir)
    print("Đã giải nén StanfordCars.zip")
else:
    print("Thư mục stanford_cars đã tồn tại, không cần giải nén.")

In [2]:
# === BƯỚC 2: XÂY DỰNG MÔ HÌNH VQA ===
# Chuẩn bị biến đổi ảnh
image_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Lớp Attention
class Attention(nn.Module):
    def __init__(self, img_dim, question_dim, hidden_dim, num_heads=4, dropout=0.1):
        super(Attention, self).__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.head_dim = hidden_dim // num_heads
        self.img_proj = nn.Linear(img_dim, hidden_dim)
        self.question_proj = nn.Linear(question_dim, hidden_dim)
        self.output_proj = nn.Linear(hidden_dim, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_dim)

    def forward(self, img_features, question_features):
        batch_size = img_features.size(0)
        if len(img_features.shape) == 2:
            img_features = img_features.unsqueeze(1)
        seq_len = img_features.size(1)
        question_features = question_features.unsqueeze(1).expand(-1, seq_len, -1)
        img_proj = self.img_proj(img_features)
        question_proj = self.question_proj(question_features)
        img_heads = img_proj.view(batch_size, seq_len, self.num_heads, self.head_dim)
        question_heads = question_proj.view(batch_size, seq_len, self.num_heads, self.head_dim)
        attention_scores = torch.sum(img_heads * question_heads, dim=-1) / (self.head_dim ** 0.5)
        attention_weights = F.softmax(attention_scores, dim=1)
        attention_weights = self.dropout(attention_weights)
        attention_weights = attention_weights.unsqueeze(-1).expand(-1, -1, -1, self.head_dim)
        weighted_features = img_heads * attention_weights
        attended_features = weighted_features.sum(dim=1).reshape(batch_size, self.hidden_dim)
        attended_features = self.output_proj(attended_features)
        attended_features = self.dropout(attended_features)
        residual = self.img_proj(img_features.mean(dim=1))
        return self.layer_norm(attended_features + residual)

# ImageEncoder với ConvNeXt
class ImageEncoder(nn.Module):
    def __init__(self, img_dim, fine_tune=True, fine_tune_layers=None):
        super(ImageEncoder, self).__init__()
        self.base_model = timm.create_model('convnext_small', pretrained=True)
        feat_dim = self.base_model.head.fc.in_features
        self.base_model.head.fc = nn.Identity()
        self.fc = nn.Sequential(
            nn.Linear(feat_dim, img_dim),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        if not fine_tune:
            for param in self.base_model.parameters():
                param.requires_grad = False
        elif fine_tune and fine_tune_layers is not None:
            layers_to_freeze = len(list(self.base_model.stages)) - fine_tune_layers
            for i, stage in enumerate(self.base_model.stages):
                if i < layers_to_freeze:
                    for param in stage.parameters():
                        param.requires_grad = False

    def forward(self, images):
        features = self.base_model(images)
        return self.fc(features)

# QuestionEncoder với LSTM hai chiều
class QuestionEncoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers=2, dropout=0.3):
        super(QuestionEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size // 2, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, questions, lengths=None):
        embedded = self.dropout(self.embedding(questions))
        if lengths is not None and lengths.max() > 0:
            packed = pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
            _, (hidden, _) = self.lstm(packed)
        else:
            _, (hidden, _) = self.lstm(embedded)
        hidden_concat = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(hidden_concat)

# Decoder
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, dropout=0.3):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTMCell(embed_size + hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, input_token, hidden, cell, context):
        embedded = self.dropout(self.embedding(input_token.view(-1))).squeeze(1)
        lstm_input = torch.cat([embedded, context], dim=1)
        hidden_next, cell_next = self.lstm(lstm_input, (hidden, cell))
        hidden_next = self.layer_norm(hidden_next)
        hidden_next = self.dropout(hidden_next)
        output = self.fc(hidden_next)
        return output, hidden_next, cell_next

# Mô hình VQA tổng thể
class VQAModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, img_dim, fine_tune=True, fine_tune_layers=None, dropout=0.3):
        super(VQAModel, self).__init__()
        self.image_encoder = ImageEncoder(img_dim, fine_tune, fine_tune_layers)
        self.question_encoder = QuestionEncoder(vocab_size, embed_size, hidden_size, dropout=dropout)
        self.attention = Attention(img_dim, hidden_size, hidden_size)
        self.fusion = nn.Sequential(
            nn.Linear(img_dim + hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.decoder = Decoder(vocab_size, embed_size, hidden_size, dropout=dropout)
        self.vocab_size = vocab_size

    def forward(self, images, questions, answers=None, question_lengths=None, teacher_forcing_ratio=0.5, max_len=20):
        batch_size = images.size(0)
        device = images.device
        img_features = self.image_encoder(images)
        question_features = self.question_encoder(questions, question_lengths)
        attended_img = self.attention(img_features.unsqueeze(1), question_features)
        fused_features = self.fusion(torch.cat([attended_img, question_features], dim=1))
        hidden = fused_features
        cell = torch.zeros_like(hidden).to(device)
        if answers is not None:
            target_max_len = answers.size(1)
        else:
            target_max_len = max_len
        decoder_input = torch.tensor([token_to_id['<sos>']] * batch_size).unsqueeze(1).to(device)
        outputs = torch.zeros(batch_size, target_max_len, self.vocab_size).to(device)
        for t in range(target_max_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell, fused_features)
            outputs[:, t] = output
            use_teacher_forcing = random.random() < teacher_forcing_ratio
            if use_teacher_forcing and answers is not None and t < answers.size(1) - 1:
                decoder_input = answers[:, t].unsqueeze(1)
            else:
                decoder_input = output.argmax(dim=1).unsqueeze(1)
        return outputs

In [4]:
# === BƯỚC 3: XỬ LÝ DỮ LIỆU ===
# Hàm chuyển văn bản thành chuỗi token
def text_to_sequence(text, token_to_id, max_len=None):
    tokens = ViTokenizer.tokenize(text.lower()).split()
    tokens.append('<eos>')
    if max_len and len(tokens) > max_len:
        tokens = tokens[:max_len-1] + ['<eos>']
    return [token_to_id.get(token, token_to_id['<unk>']) for token in tokens]

# Tạo từ vựng
def build_vocab(qa_pairs, min_freq=2):
    word_counts = Counter()
    for pair in qa_pairs:
        word_counts.update(ViTokenizer.tokenize(pair['question'].lower()).split())
        word_counts.update(ViTokenizer.tokenize(pair['answer'].lower()).split())
    common_words = [word for word, count in word_counts.items() if count >= min_freq]
    vocab = ['<pad>', '<sos>', '<eos>', '<unk>'] + common_words
    token_to_id = {token: idx for idx, token in enumerate(vocab)}
    id_to_token = {idx: token for token, idx in token_to_id.items()}
    print(f"Kích thước từ điển: {len(vocab)} từ")
    return token_to_id, id_to_token

# Chia dữ liệu
def split_data(qa_pairs, val_size=0.1, test_size=0.1):
    image_to_qa = {}
    for qa in qa_pairs:
        image = qa['image']
        if image not in image_to_qa:
            image_to_qa[image] = []
        image_to_qa[image].append(qa)
    images = list(image_to_qa.keys())
    train_imgs, temp_imgs = train_test_split(images, test_size=(val_size+test_size), random_state=42)
    val_imgs, test_imgs = train_test_split(temp_imgs, test_size=test_size/(val_size+test_size), random_state=42)
    train_qa = [qa for img in train_imgs for qa in image_to_qa[img]]
    val_qa = [qa for img in val_imgs for qa in image_to_qa[img]]
    test_qa = [qa for img in test_imgs for qa in image_to_qa[img]]
    print(f"Tập huấn luyện: {len(train_qa)} cặp QA ({len(train_imgs)} ảnh)")
    print(f"Tập kiểm định: {len(val_qa)} cặp QA ({len(val_imgs)} ảnh)")
    print(f"Tập kiểm tra: {len(test_qa)} cặp QA ({len(test_imgs)} ảnh)")
    return train_qa, val_qa, test_qa

# Dataset
class ImprovedVQADataset(Dataset):
    def __init__(self, qa_pairs, image_dir, token_to_id, max_len=20, transform=None):
        self.qa_pairs = qa_pairs
        self.image_dir = image_dir
        self.token_to_id = token_to_id
        self.max_len = max_len
        self.transform = transform if transform else image_transform

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        try:
            pair = self.qa_pairs[idx]
            image_path = os.path.join(self.image_dir, pair['image'])
            image = Image.open(image_path).convert('RGB')
            image = self.transform(image)
            question = text_to_sequence(pair['question'], self.token_to_id, self.max_len)
            answer = text_to_sequence(pair['answer'], self.token_to_id, self.max_len)
            answer_input = [self.token_to_id['<sos>']] + answer[:-1]
            return {
                'image': image,
                'question': torch.tensor(question),
                'question_len': len(question),
                'answer': torch.tensor(answer),
                'answer_input': torch.tensor(answer_input),
                'answer_len': len(answer)
            }
        except Exception as e:
            print(f"Lỗi khi xử lý mục {idx}: {str(e)}")
            dummy_image = torch.zeros(3, 224, 224)
            dummy_seq = torch.tensor([self.token_to_id['<unk>'], self.token_to_id['<eos>']])
            return {
                'image': dummy_image,
                'question': dummy_seq,
                'question_len': 2,
                'answer': dummy_seq,
                'answer_input': dummy_seq,
                'answer_len': 2
            }

# Hàm collate cho DataLoader
def improved_collate_fn(batch):
    batch = sorted(batch, key=lambda x: x['question_len'], reverse=True)
    max_answer_len = max(item['answer_len'] for item in batch)
    max_question_len = max(item['question_len'] for item in batch)
    images = torch.stack([item['image'] for item in batch])
    questions = []
    question_lengths = []
    answers = []
    answer_inputs = []
    for item in batch:
        q_padded = F.pad(item['question'], (0, max_question_len - item['question_len']), 'constant', 0)
        questions.append(q_padded)
        question_lengths.append(item['question_len'])
        a_padded = F.pad(item['answer'], (0, max_answer_len - item['answer_len']), 'constant', 0)
        answers.append(a_padded)
        a_in_padded = F.pad(item['answer_input'], (0, max_answer_len - item['answer_len']), 'constant', 0)
        answer_inputs.append(a_in_padded)
    return (torch.stack(questions),
            torch.tensor(question_lengths),
            torch.stack(answers),
            torch.stack(answer_inputs),
            images)

# Load dữ liệu từ file JSON
IMAGE_DIR = os.path.join("/content/stanford_cars/cars_train/cars_train")
OUTPUT_FILE = "/content/questions_answers_vqa.json"  # Đường dẫn tới file JSON của bạn
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
    qa_pairs = json.load(f)

# Chia dữ liệu
train_qa, val_qa, test_qa = split_data(qa_pairs)
token_to_id, id_to_token = build_vocab(train_qa, min_freq=2)

# Tạo dataset
train_dataset = ImprovedVQADataset(train_qa, IMAGE_DIR, token_to_id, max_len=20, transform=image_transform)
val_dataset = ImprovedVQADataset(val_qa, IMAGE_DIR, token_to_id, max_len=20, transform=test_transform)
test_dataset = ImprovedVQADataset(test_qa, IMAGE_DIR, token_to_id, max_len=20, transform=test_transform)

# Tạo DataLoader
BATCH_SIZE = 32
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=improved_collate_fn,
    num_workers=2,
    pin_memory=True
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=improved_collate_fn,
    num_workers=2,
    pin_memory=True
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=improved_collate_fn,
    num_workers=2,
    pin_memory=True
)

print(f"Tạo DataLoader thành công: {len(train_dataloader)} batch huấn luyện, {len(val_dataloader)} batch kiểm định, {len(test_dataloader)} batch kiểm tra.")

Tập huấn luyện: 32575 cặp QA (6515 ảnh)
Tập kiểm định: 4070 cặp QA (814 ảnh)
Tập kiểm tra: 4075 cặp QA (815 ảnh)
Kích thước từ điển: 197 từ
Tạo DataLoader thành công: 1018 batch huấn luyện, 128 batch kiểm định, 128 batch kiểm tra.


In [None]:
# === BƯỚC 4: HUẤN LUYỆN VÀ ĐÁNH GIÁ ===
# Định nghĩa thiết bị
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Sử dụng thiết bị: {device}")

# Hàm huấn luyện một epoch
def train_epoch(model, dataloader, optimizer, criterion, device, teacher_forcing_ratio, scaler, accumulation_steps=1):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, (questions, question_lengths, answers, answer_inputs, images) in enumerate(tqdm(dataloader, desc="Training")):
        images = images.to(device)
        questions = questions.to(device)
        question_lengths = question_lengths.to(device)
        answers = answers.to(device)
        answer_inputs = answer_inputs.to(device)
        with autocast():
            outputs = model(images, questions, answer_inputs, question_lengths, teacher_forcing_ratio=teacher_forcing_ratio)
            batch_size, seq_len, vocab_size = outputs.size()
            outputs = outputs.view(batch_size * seq_len, vocab_size)
            answers = answers.view(batch_size * seq_len)
            loss = criterion(outputs, answers)
        scaler.scale(loss / accumulation_steps).backward()
        if (i + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Hàm đánh giá
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for questions, question_lengths, answers, answer_inputs, images in tqdm(dataloader, desc="Evaluating"):
            images = images.to(device)
            questions = questions.to(device)
            question_lengths = question_lengths.to(device)
            answers = answers.to(device)
            answer_inputs = answer_inputs.to(device)
            with autocast():
                outputs = model(images, questions, answer_inputs, question_lengths, teacher_forcing_ratio=0.0)
                batch_size, seq_len, vocab_size = outputs.size()
                outputs = outputs.view(batch_size * seq_len, vocab_size)
                answers = answers.view(batch_size * seq_len)
                loss = criterion(outputs, answers)
            total_loss += loss.item()
    return total_loss / len(dataloader)

# Thiết lập tham số
VOCAB_SIZE = len(token_to_id)
EMBED_SIZE = 256
HIDDEN_SIZE = 512
IMG_DIM = 512
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
ACCUMULATION_STEPS = 2

# Khởi tạo mô hình
model = VQAModel(
    vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    hidden_size=HIDDEN_SIZE,
    img_dim=IMG_DIM,
    fine_tune=True,
    fine_tune_layers=4,
    dropout=0.2
).to(device)

# Loss và optimizer
criterion = nn.CrossEntropyLoss(label_smoothing=0.1, ignore_index=token_to_id['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
scheduler = OneCycleLR(optimizer, max_lr=0.01, total_steps=NUM_EPOCHS * len(train_dataloader) // ACCUMULATION_STEPS, pct_start=0.3)
scaler = GradScaler()

# Early stopping
patience = 3
no_improve = 0
best_val_loss = float('inf')
train_losses = []
val_losses = []

# Huấn luyện
for epoch in range(NUM_EPOCHS):
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
    teacher_forcing_ratio = max(0.7 - (epoch // 2) * 0.05, 0.0)
    train_loss = train_epoch(model, train_dataloader, optimizer, criterion, device, teacher_forcing_ratio, scaler, ACCUMULATION_STEPS)
    train_losses.append(train_loss)
    val_loss = evaluate(model, val_dataloader, criterion, device)
    val_losses.append(val_loss)
    scheduler.step()
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {optimizer.param_groups[0]['lr']}")
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improve = 0
        torch.save(model.state_dict(), '/content/drive/MyDrive/Project_Gki/best_vqa_model.pth')
        print("Đã lưu mô hình tốt nhất!")
    else:
        no_improve += 1
        if no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Vẽ biểu đồ loss
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train Loss')
plt.plot(range(1, len(val_losses) + 1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

# Đánh giá trên tập test
test_loss = evaluate(model, test_dataloader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")

# Lưu mô hình cuối cùng
torch.save(model.state_dict(), '/content/drive/MyDrive/Project_Gki/final_vqa_model.pth')
print("Đã lưu mô hình cuối cùng!")

Sử dụng thiết bị: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Epoch 1/10


  scaler = GradScaler()


Training:   0%|          | 0/1018 [00:00<?, ?it/s]

  with autocast():


Evaluating:   0%|          | 0/128 [00:00<?, ?it/s]

  with autocast():


Train Loss: 1.8610 | Val Loss: 1.8426 | LR: 0.0004000101718926105
Đã lưu mô hình tốt nhất!
Epoch 2/10


Training:   0%|          | 0/1018 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/128 [00:00<?, ?it/s]

Train Loss: 1.7626 | Val Loss: 1.8286 | LR: 0.0004000406875273293
Đã lưu mô hình tốt nhất!
Epoch 3/10


Training:   0%|          | 0/1018 [00:00<?, ?it/s]