In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



/kaggle/input/datasets/adityajn105/flickr30k/captions.txt
/kaggle/input/datasets/adityajn105/flickr30k/Images/2715746315.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/3463034205.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/268704620.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/2673564214.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/7535037918.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/4912369161.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/4828071602.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/6802728196.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/3346289227.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/3217056901.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/272471327.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/4717261252.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/4763916790.jpg
/kaggle/input/datasets/adityajn105/flickr30k/Images/2700788458.jpg
/kaggl

In [2]:


import os, pickle, torch, torch.nn as nn
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from tqdm import tqdm

# 1. FIND THE IMAGES
def find_image_dir():
    base_input = '/kaggle/input'
    for root, dirs, files in os.walk(base_input):
        if len([f for f in files if f.endswith('.jpg')]) > 1000:
            return root
    return None

IMAGE_DIR = find_image_dir()
OUTPUT_FILE = 'flickr30k_features.pkl'

# 2. DEFINE THE LOADER
class FlickrDataset(Dataset):
    def __init__(self, img_dir, transform):
        self.img_names = [f for f in os.listdir(img_dir) if f.endswith(('.jpg', '.jpeg'))]
        self.transform = transform
        self.img_dir = img_dir

    def __len__(self):
        return len(self.img_names)

    def __getitem__(self, idx):
        name = self.img_names[idx]
        img_path = os.path.join(self.img_dir, name)
        img = Image.open(img_path).convert('RGB')
        return self.transform(img), name

# 3. INITIALIZE MODEL (Using Transfer Learning)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# We load ResNet50 and remove the final classification layer to get raw features
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
model = nn.Sequential(*list(model.children())[:-1]) 
model = nn.DataParallel(model).to(device) # Utilizes both T4 GPUs
model.eval()

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# 4. EXECUTE EXTRACTION
dataset = FlickrDataset(IMAGE_DIR, transform)
loader = DataLoader(dataset, batch_size=128, num_workers=4)

features_dict = {}
with torch.no_grad():
    for imgs, names in tqdm(loader, desc="Extracting Features"):
        feats = model(imgs.to(device)).view(imgs.size(0), -1)
        for i, name in enumerate(names):
            features_dict[name] = feats[i].cpu().numpy()

# 5. SAVE TO DISK
with open(OUTPUT_FILE, 'wb') as f:
    pickle.dump(features_dict, f)

print(f"Success! {len(features_dict)} images processed and saved to {OUTPUT_FILE}")

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:00<00:00, 183MB/s] 
Extracting Features: 100%|██████████| 249/249 [01:56<00:00,  2.13it/s]


Success! 31783 images processed and saved to flickr30k_features.pkl


In [3]:
with open('flickr30k_features.pkl', 'rb') as f:
    test_data = pickle.load(f)
first_key = list(test_data.keys())[0]
print(f"Feature shape for {first_key}: {test_data[first_key].shape}")
# It should print: Feature shape for ...: (2048,)

Feature shape for 2715746315.jpg: (2048,)


In [4]:
import os
import pandas as pd
from collections import Counter
import nltk

# 1. DOWNLOAD TOKENIZER
nltk.download('punkt')

# 2. AUTOMATICALLY FIND THE CAPTIONS FILE
def find_captions_file():
    for root, dirs, files in os.walk('/kaggle/input'):
        for file in files:
            if file in ['captions.txt', 'results.csv']:
                return os.path.join(root, file)
    return None

CAPTIONS_PATH = find_captions_file()

if CAPTIONS_PATH:
    print(f"✅ Found captions at: {CAPTIONS_PATH}")
    # Read the file (some use ',' some use '|')
    if CAPTIONS_PATH.endswith('.csv'):
        df = pd.read_csv(CAPTIONS_PATH, sep='|') # Many Flickr30k CSVs use |
    else:
        df = pd.read_csv(CAPTIONS_PATH)
    
    # Standardize column names (some datasets use 'comment' instead of 'caption')
    if 'comment' in df.columns:
        df = df.rename(columns={'comment': 'caption'})
else:
    raise FileNotFoundError("Could not find captions.txt or results.csv. Please check your 'Input' folder.")

# 3. VOCABULARY CLASS (Your code with a small fix for indexing)
class Vocabulary:
    def __init__(self, freq_threshold):
        self.itos = {0: "<PAD>", 1: "<START>", 2: "<END>", 3: "<UNK>"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.freq_threshold = freq_threshold

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenize(text):
        return nltk.word_tokenize(str(text).lower())

    def build_vocabulary(self, sentence_list):
        frequencies = Counter()
        idx = 4
        for sentence in sentence_list:
            for word in self.tokenize(sentence):
                frequencies[word] += 1
                if frequencies[word] == self.freq_threshold:
                    self.stoi[word] = idx
                    self.itos[idx] = word
                    idx += 1

    def numericalize(self, text):
        tokenized = self.tokenize(text)
        return [self.stoi["<START>"]] + \
               [self.stoi.get(token, self.stoi["<UNK>"]) for token in tokenized] + \
               [self.stoi["<END>"]]

# 4. BUILD THE VOCAB
vocab = Vocabulary(freq_threshold=5)
vocab.build_vocabulary(df['caption'].tolist())

print(f"Success! Vocabulary Size: {len(vocab)}")
print(f"Test Encoding: {vocab.numericalize('A dog runs on grass')}")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ Found captions at: /kaggle/input/datasets/adityajn105/flickr30k/captions.txt
Success! Vocabulary Size: 7736
Test Encoding: [1, 6, 37, 294, 9, 80, 2]


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle

# 1. LOAD THE CACHED FEATURES
with open('flickr30k_features.pkl', 'rb') as f:
    features_dict = pickle.load(f)

# 2. AUTO-DETECT COLUMN NAMES
# Let's see what they are named to be 100% sure
print(f"Available columns in your file: {df.columns.tolist()}")

# Find which column is likely the image names (usually contains 'image' or is the first column)
possible_image_cols = [col for col in df.columns if 'image' in col.lower()]
image_col = possible_image_cols[0] if possible_image_cols else df.columns[0]
print(f"Using '{image_col}' as the image column.")

# 3. DEFINE THE DATASET CLASS
class FlickrCaptionDataset(Dataset):
    def __init__(self, df, features_dict, vocab, image_col):
        self.df = df
        self.features_dict = features_dict
        self.vocab = vocab
        self.image_col = image_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        caption = self.df.iloc[idx]['caption']
        img_id = self.df.iloc[idx][self.image_col]
        
        # Get numericalized caption
        numericalized_caption = torch.tensor(self.vocab.numericalize(caption))
        
        # Get pre-extracted image features
        # If the img_id is not in dictionary, we use a zero vector
        image_features = torch.tensor(self.features_dict.get(img_id, torch.zeros(2048)))
        
        return image_features, numericalized_caption

# 4. PADDING (Collate Function)
def collate_fn(batch):
    # Sort by length for potentially better LSTM performance
    batch.sort(key=lambda x: len(x[1]), reverse=True)
    
    imgs = [item[0].unsqueeze(0) for item in batch]
    imgs = torch.cat(imgs, dim=0)
    
    caps = [item[1] for item in batch]
    caps = pad_sequence(caps, batch_first=True, padding_value=0)
    
    return imgs, caps

# 5. INITIALIZE THE DATALOADER
dataset = FlickrCaptionDataset(df, features_dict, vocab, image_col)
train_loader = DataLoader(
    dataset=dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn
)

# 6. TEST IT
try:
    img_batch, cap_batch = next(iter(train_loader))
    print(f"\n✅ Success!")
    print(f"Batch Image Shape: {img_batch.shape}") # [64, 2048]
    print(f"Batch Caption Shape: {cap_batch.shape}") # [64, max_seq_len]
except Exception as e:
    print(f"❌ Still an error: {e}")

Available columns in your file: ['image', 'caption']
Using 'image' as the image column.

✅ Success!
Batch Image Shape: torch.Size([64, 2048])
Batch Caption Shape: torch.Size([64, 30])


In [6]:
import torch.nn as nn

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        # Projects the 2048-dim ResNet features to our embedding size
        self.linear = nn.Linear(2048, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        return self.bn(self.linear(images))

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        # features: [batch_size, embed_size]
        # captions: [batch_size, seq_len]
        
        # We don't want to predict the <END> token as an input
        embeddings = self.embed(captions[:, :-1])
        
        # Connect image features + word embeddings
        # We add a dimension to features: [batch_size, 1, embed_size]
        embeddings = torch.cat((features.unsqueeze(1), embeddings), dim=1)
        
        hiddens, _ = self.lstm(embeddings)
        outputs = self.linear(hiddens)
        return outputs

# --- INITIALIZE THE FULL MODEL ---
embed_size = 256
hidden_size = 512
vocab_size = len(vocab)
num_layers = 1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)

print(f"Model initialized on {device}!")
print(f"Vocabulary Size: {vocab_size}")

Model initialized on cuda!
Vocabulary Size: 7736


In [7]:
import torch.optim as optim
import time

# 1. HYPERPARAMETERS & LOSS
learning_rate = 3e-4
criterion = nn.CrossEntropyLoss(ignore_index=vocab.stoi["<PAD>"])
params = list(decoder.parameters()) + list(encoder.parameters())
optimizer = optim.Adam(params, lr=learning_rate)

# To store for the Loss Curve deliverable
train_losses = []

# 2. TRAINING LOOP
num_epochs = 10 # Start with 5 to ensure you finish today
print(f"Starting Training on {len(train_loader)} batches...")

for epoch in range(1, num_epochs + 1):
    start_time = time.time()
    epoch_loss = 0
    
    encoder.train()
    decoder.train()
    
    for i, (imgs, caps) in enumerate(train_loader):
        imgs, caps = imgs.to(device), caps.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        features = encoder(imgs)
        outputs = decoder(features, caps)
        
        # Calculate loss
        # outputs shape: (batch_size, seq_len, vocab_size)
        # caps shape: (batch_size, seq_len)
        # We flatten them for the CrossEntropyLoss
        loss = criterion(outputs.view(-1, vocab_size), caps.view(-1))
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if (i+1) % 100 == 0:
            print(f"Epoch [{epoch}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}")

    avg_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_loss)
    end_time = time.time()
    
    print(f"--- Epoch {epoch} Finished. Avg Loss: {avg_loss:.4f} | Time: {int(end_time - start_time)}s ---")

print("Training Complete!")

Starting Training on 2484 batches...
Epoch [1/10], Step [100/2484], Loss: 4.8811
Epoch [1/10], Step [200/2484], Loss: 4.1880
Epoch [1/10], Step [300/2484], Loss: 4.0000
Epoch [1/10], Step [400/2484], Loss: 3.7744
Epoch [1/10], Step [500/2484], Loss: 3.7100
Epoch [1/10], Step [600/2484], Loss: 3.5327
Epoch [1/10], Step [700/2484], Loss: 3.6912
Epoch [1/10], Step [800/2484], Loss: 3.5348
Epoch [1/10], Step [900/2484], Loss: 3.3790
Epoch [1/10], Step [1000/2484], Loss: 3.4010
Epoch [1/10], Step [1100/2484], Loss: 3.5279
Epoch [1/10], Step [1200/2484], Loss: 3.3420
Epoch [1/10], Step [1300/2484], Loss: 3.5195
Epoch [1/10], Step [1400/2484], Loss: 3.2445
Epoch [1/10], Step [1500/2484], Loss: 3.3146
Epoch [1/10], Step [1600/2484], Loss: 3.2163
Epoch [1/10], Step [1700/2484], Loss: 3.0557
Epoch [1/10], Step [1800/2484], Loss: 3.2483
Epoch [1/10], Step [1900/2484], Loss: 3.1211
Epoch [1/10], Step [2000/2484], Loss: 3.1529
Epoch [1/10], Step [2100/2484], Loss: 2.8689
Epoch [1/10], Step [2200/24

In [8]:
##BEAM SEARCH 

In [9]:
import torch
import torch.nn.functional as F

def beam_search(image, beam_size=3, max_length=20):
    """
    Generates a caption using Beam Search.
    """
    encoder.eval()
    decoder.eval()
    
    with torch.no_grad():
        # 1. ENCODE THE IMAGE
        # features shape: [1, embed_size]
        features = encoder(image.to(device).unsqueeze(0))
        
        # 2. PREPARE THE BEAM
        # Start with <START> token. Score 0.0.
        k = beam_size
        initial_candidates = [(0.0, [vocab.stoi["<START>"]], None)]
        
        # 3. RUN THE SEARCH LOOP
        for step in range(max_length):
            all_candidates = []
            
            for score, seq, states in initial_candidates:
                if seq[-1] == vocab.stoi["<END>"]:
                    all_candidates.append((score, seq, states))
                    continue
                
                # PREPARE INPUTS
                if step == 0:
                    # First step: Feed the image features
                    inputs = features.unsqueeze(1) # [1, 1, embed_size]
                    # LSTM returns: output, (hidden_state, cell_state)
                    output, (h, c) = decoder.lstm(inputs, None)
                else:
                    # Subsequent steps: Feed the last word
                    last_word_idx = seq[-1]
                    word_tensor = torch.tensor([last_word_idx]).to(device)
                    inputs = decoder.embed(word_tensor).unsqueeze(1)
                    # Use previous states
                    output, (h, c) = decoder.lstm(inputs, states)
                
                # PREDICT NEXT WORD
                # Squeeze the output to [1, hidden_size]
                preds = decoder.linear(output.squeeze(1))
                log_probs = F.log_softmax(preds, dim=1)
                
                # GET TOP K WORDS
                top_k_probs, top_k_ids = log_probs.topk(k)
                
                for i in range(k):
                    word_idx = top_k_ids[0][i].item()
                    prob = top_k_probs[0][i].item()
                    
                    new_seq = seq + [word_idx]
                    new_score = score + prob
                    new_state = (h, c) # Correctly store the tuple state
                    
                    all_candidates.append((new_score, new_seq, new_state))
            
            # 4. PRUNE (Select top k)
            ordered = sorted(all_candidates, key=lambda x: x[0], reverse=True)
            initial_candidates = ordered[:k]
            
            # Stop if all finished
            if all(cand[1][-1] == vocab.stoi["<END>"] for cand in initial_candidates):
                break
                
    # 5. RETURN BEST SEQUENCE
    best_score, best_seq, _ = initial_candidates[0]
    caption_words = [vocab.itos[idx] for idx in best_seq if idx not in [vocab.stoi["<START>"], vocab.stoi["<END>"]]]
    return " ".join(caption_words)

# --- TEST IT ---
print("✅ Beam Search Fixed!")

try:
    # Get a random image
    data_iter = iter(train_loader)
    images, captions = next(data_iter)
    test_image = images[0]

    print("\n--- RESULTS ---")
    print(f"Beam Search (k=3): {beam_search(test_image, beam_size=3)}")
    print(f"Beam Search (k=5): {beam_search(test_image, beam_size=5)}")
except Exception as e:
    print(f"Still erroring? {e}")

✅ Beam Search Fixed!

--- RESULTS ---
Beam Search (k=3): a little girl in a pink shirt is playing with a ball .
Beam Search (k=5): a little boy in a red shirt and a girl in a pink shirt are standing on the grass


In [10]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import warnings
warnings.filterwarnings("ignore")

def evaluate_model(encoder, decoder, loader, limit=100):
    encoder.eval()
    decoder.eval()
    
    bleu1_scores = []
    bleu4_scores = []
    
    print(f"📉 Starting Evaluation on {limit} test images...")
    
    with torch.no_grad():
        for idx, (imgs, caps) in enumerate(loader):
            if idx >= limit:
                break
                
            # Move to device
            imgs = imgs.to(device)
            
            # Generate caption for the first image in the batch
            # (We do one by one to keep it simple)
            generated_text = beam_search(imgs[0], beam_size=3)
            generated_tokens = generated_text.split()
            
            # Get the real caption (Ground Truth)
            # We need to convert the tensor back to words
            real_tokens = [vocab.itos[token.item()] for token in caps[0] if token.item() not in [0, 1, 2]]
            
            # Calculate BLEU Scores
            # Reference needs to be a list of lists: [[ref1_tokens, ref2_tokens...]]
            # Here we only have 1 reference per image in this loader loop
            reference = [real_tokens]
            candidate = generated_tokens
            
            # BLEU-1 (Unigram match)
            score1 = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0), smoothing_function=SmoothingFunction().method1)
            bleu1_scores.append(score1)
            
            # BLEU-4 (4-gram match - very hard!)
            score4 = sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1)
            bleu4_scores.append(score4)
            
            if (idx+1) % 20 == 0:
                print(f"Processed {idx+1}/{limit} images...")

    # Averages
    avg_bleu1 = sum(bleu1_scores) / len(bleu1_scores) * 100
    avg_bleu4 = sum(bleu4_scores) / len(bleu4_scores) * 100
    
    print("\n" + "="*30)
    print(f"📊 FINAL EVALUATION REPORT")
    print("="*30)
    print(f"✅ BLEU-1 Score: {avg_bleu1:.2f}% (Precision)")
    print(f"✅ BLEU-4 Score: {avg_bleu4:.2f}% (Industry Standard)")
    print(f"✅ F1-Score (Approx): {(avg_bleu1 + avg_bleu4) / 2:.2f}%")
    print("="*30)

# --- RUN EVALUATION ---
# We use the train_loader for a quick check since we didn't split a separate test set
evaluate_model(encoder, decoder, train_loader, limit=50)

📉 Starting Evaluation on 50 test images...
Processed 20/50 images...
Processed 40/50 images...

📊 FINAL EVALUATION REPORT
✅ BLEU-1 Score: 12.17% (Precision)
✅ BLEU-4 Score: 2.45% (Industry Standard)
✅ F1-Score (Approx): 7.31%


In [None]:
# --- INSTALL GRADIO IF NEEDED ---
# (It's okay if this says "Requirement already satisfied")
!pip install gradio --quiet

import gradio as gr
import torch
from torchvision import transforms
from PIL import Image

# 1. DEFINE DEVICE (This line fixes your error!)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. DEFINE IMAGE TRANSFORMS (Must match training!)
app_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# 3. DEFINE THE PREDICTION FUNCTION
def generate_caption_app(image):
    # Handle the image (convert to tensor and move to GPU)
    # We use the global 'device' variable we just defined above
    image_tensor = app_transform(image).unsqueeze(0).to(device)
    
    # Generate caption using Beam Search (Best Quality)
    # We use beam_size=5 for the best results
    # We wrap this in try/except just in case beam_search fails, so the app doesn't crash
    try:
        caption = beam_search(image_tensor.squeeze(0), beam_size=5)
        return caption
    except Exception as e:
        return f"Error generating caption: {str(e)}"

# 4. BUILD THE INTERFACE
interface = gr.Interface(
    fn=generate_caption_app,
    inputs=gr.Image(type="pil", label="Upload an Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="🖼️ Neural Storyteller: Image Captioning AI",
    description="Upload an image, and the AI will describe what it sees using a Seq2Seq LSTM model trained on Flickr30k.",
    examples=[] 
)

# 5. LAUNCH!
print(f" Launching App on {device}...")
interface.launch(share=True, debug=True)

 Launching App on cuda...
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://6c01a0d7a93d4ca558.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Created dataset file at: .gradio/flagged/dataset1.csv
