In [42]:
#@title Setup & Imports
import os
import zipfile
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import json
# import nltk
# from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
# from nltk.translate.meteor_score import meteor_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
import io
import random
from src.config import *
from src.dataset import load_csv_paths, create_dataloaders
from src.utils import build_vocab
from src.model import GRNN
from src.train import train_model, plot_history
from src.eval import test_loss, generate_question
from src.dataset import VQGTensorDataset
import torch.nn as nn
import torch.optim as optim

# Seed every RNG this notebook imports so that "Restart Kernel -> Run All"
# repeats the same random draws. Python's `random` module is imported above
# but was previously left unseeded.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Request deterministic cuDNN kernels and turn off cuDNN benchmark mode,
# which may otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [None]:
# import nltk
# import os

# nltk.download('punkt', download_dir='~/nltk_data')
# nltk.data.path.append(os.path.expanduser('~/nltk_data'))


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [39]:
def extract_if_zip(zip_path, extract_to=None):
    """Return a directory for ``zip_path``, extracting the archive if needed.

    Args:
        zip_path: Path string. If it does not end in ``.zip`` it is returned
            unchanged (treated as an already-extracted folder).
        extract_to: Destination directory. Defaults to ``zip_path`` with its
            trailing ``.zip`` suffix removed.

    Returns:
        ``zip_path`` itself for non-archives, otherwise the extraction
        directory.
    """
    suffix = ".zip"
    if not zip_path.endswith(suffix):
        return zip_path  # already a folder

    # Strip only the trailing suffix. ``str.replace`` would also rewrite a
    # ".zip" appearing earlier in the path (e.g. "raw.zip.d/imgs.zip" would
    # become "raw.d/imgs"), pointing at the wrong directory.
    extract_to = extract_to or zip_path[:-len(suffix)]

    # An existing destination is taken to be a previous extraction and is
    # left untouched.
    if not os.path.exists(extract_to):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    return extract_to

In [40]:
# Tensor-index CSV for each split of the Bing data
train_csv = "data/bing_data/bing_redistributed_indexes/bing_train_tensor_index.csv"
val_csv   = "data/bing_data/bing_redistributed_indexes/bing_val_tensor_index.csv"
test_csv  = "data/bing_data/bing_redistributed_indexes/bing_test_tensor_index.csv"

# Pool the non-null entries of the "questions" column across every split
questions = []
for path in (train_csv, val_csv, test_csv):
    df = pd.read_csv(path)
    questions += df["questions"].dropna().tolist()


In [None]:
import re
from collections import Counter

def build_vocab(questions, min_freq=1):
    """Build word<->index lookup tables from an iterable of questions.

    Each question is lower-cased and split into runs of word characters and
    single punctuation characters. Indices 0-3 are reserved for the special
    tokens ``<pad>``, ``<start>``, ``<end>`` and ``<unk>``; remaining tokens
    receive consecutive indices in first-seen order, provided they occur at
    least ``min_freq`` times.

    Returns:
        A ``(vocab, idx_to_word)`` pair: token -> index and its inverse.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)

    # Counter keeps first-insertion order, which fixes the index assignment.
    frequencies = Counter(
        token
        for q in questions
        for token in token_pattern.findall(str(q).lower())
    )

    specials = ['<pad>', '<start>', '<end>', '<unk>']
    kept = [word for word, freq in frequencies.items() if freq >= min_freq]

    vocab = {token: position for position, token in enumerate(specials + kept)}
    idx_to_word = {position: token for token, position in vocab.items()}
    return vocab, idx_to_word

In [43]:
# Build the token -> index map and its inverse from the pooled questions
vocab, idx_to_word = build_vocab(questions)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/jacobfernandez/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.12/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/Users/jacobfernandez/nltk_data'
**********************************************************************


In [None]:
import os
import re
import pandas as pd
import torch
from torch.utils.data import Dataset

class VQGTensorDataset(Dataset):
    """Dataset pairing saved image tensors with tokenised questions.

    The index CSV must provide a ``tensor_path`` column (a file readable by
    ``torch.load``) and a ``questions`` column.
    """

    def __init__(self, csv_path, vocab, max_length=20, base_dir=""):
        """Read the index CSV and remember the encoding settings.

        Args:
            csv_path: Path to the index CSV.
            vocab: Token -> index mapping; must contain ``<pad>``,
                ``<start>``, ``<end>`` and ``<unk>``.
            max_length: Fixed length of every encoded question, including
                the ``<start>`` and ``<end>`` markers.
            base_dir: Directory prepended to each ``tensor_path`` entry.
        """
        self.df = pd.read_csv(csv_path)
        self.vocab = vocab
        self.max_length = max_length
        self.base_dir = base_dir

    def __len__(self):
        """Return the number of rows in the index CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image_tensor, token_indices, question)`` for row ``idx``.

        ``token_indices`` is a 1-D tensor of exactly ``max_length`` entries:
        ``<start>``, the question tokens, ``<end>``, then ``<pad>`` filler.
        ``question`` is the lower-cased question text.
        """
        row = self.df.iloc[idx]
        tensor_path = os.path.join(self.base_dir, row["tensor_path"])
        # NOTE(review): depending on the torch version, torch.load may
        # unpickle arbitrary objects; only point this at trusted files.
        image_tensor = torch.load(tensor_path).float()

        question = str(row["questions"]).lower()
        tokens = re.findall(r"\w+|[^\w\s]", question, re.UNICODE)

        # Truncate the question itself, leaving room for <start>/<end>.
        # Truncating after the markers are added would cut off <end> for long
        # questions and leave the sequence unterminated.
        tokens = tokens[:max(self.max_length - 2, 0)]

        unk = self.vocab['<unk>']
        indices = [self.vocab['<start>']]
        indices += [self.vocab.get(token, unk) for token in tokens]
        indices.append(self.vocab['<end>'])

        # The final slice only matters when max_length < 2; padding brings
        # short sequences up to the fixed length.
        indices = indices[:self.max_length]
        indices += [self.vocab['<pad>']] * (self.max_length - len(indices))

        return image_tensor, torch.tensor(indices), question


In [None]:
# One dataset per split, all sharing the same vocabulary and MAX_LENGTH
train_dataset, val_dataset, test_dataset = (
    VQGTensorDataset(split_csv, vocab, MAX_LENGTH)
    for split_csv in (train_csv, val_csv, test_csv)
)

# Loaders are grouped under the "bing" key; only the training split is shuffled
dataloaders = {
    "bing": {
        split: DataLoader(split_dataset, batch_size=BATCH_SIZE, shuffle=(split == "train"))
        for split, split_dataset in (
            ("train", train_dataset),
            ("val", val_dataset),
            ("test", test_dataset),
        )
    }
}

In [24]:
# Size the network from the vocabulary, then move it to the selected device
model = GRNN(EMBED_SIZE, HIDDEN_SIZE, len(vocab))
model = model.to(device)

# Loss and optimiser handed to the training loop
# NOTE(review): no ignore_index is set here; confirm whether train_model
# excludes <pad> positions from the loss itself.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LR)



In [25]:
# Train on the Bing train/val loaders, plot the returned history,
# then call test_loss on the Bing test loader.
bing_loaders = dataloaders["bing"]
history = train_model(
    model,
    bing_loaders["train"],
    bing_loaders["val"],
    vocab,
    criterion,
    optimizer,
    device,
    EPOCHS,
)
plot_history(history)
test_loss(model, bing_loaders["test"], vocab, criterion, device)


KeyError: 'train'

In [26]:
# Inspect the split -> DataLoader mapping stored under the "bing" key
dataloaders['bing']

{}

In [None]:
# Run generate_question on the Bing test loader, using idx_to_word as the
# index -> token lookup
generate_question(
    model,
    dataloaders["bing"]["test"],
    vocab,
    idx_to_word,
    device,
)