# Image Captioning System:
### The system performs the following:
1. Using pre-trained CNNs (ResNet-50) for feature extraction from images.
2. Creating an RNN model that takes the extracted feature vector as its first hidden state and is trained on the next-token prediction (NTP) task using cross-entropy loss.

In [None]:
# Importing required dependencies
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
import cv2
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision.io import read_image
import os
import re
from collections import Counter
from typing import List, Tuple, Dict, Optional, Any
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import Vocab, vocab

### 1. Feature Extraction using ResNet-50 pretrained model

In [None]:
def extract_feature(image_path, feature_extractor, preprocess):
    """Return the ``'avgpool'`` feature vector of a single image.

    Args:
        image_path: Path of the image file to decode.
        feature_extractor: Callable that takes the preprocessed image batch
            and returns a mapping containing an ``'avgpool'`` entry.
        preprocess: Transform applied to the ``(1, C, H, W)`` image batch
            before it is passed to ``feature_extractor``.

    Returns:
        The ``'avgpool'`` output with singleton dimensions squeezed out,
        moved to the CPU and converted to a NumPy array.
    """
    # Imported locally because the file's import block only brings in
    # ``read_image`` from ``torchvision.io``.
    from torchvision.io import ImageReadMode

    # Force 3-channel RGB decoding: without ``mode`` the tensor keeps the
    # file's native channel count, so grayscale or RGBA images would reach
    # the preprocessing/model with 1 or 4 channels.
    image = read_image(image_path, mode=ImageReadMode.RGB).unsqueeze(0)  # add batch dim
    transformed_image = preprocess(image)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = feature_extractor(transformed_image)

    return features['avgpool'].squeeze().cpu().numpy()

In [None]:
def get_image_paths(base_path):
    """Recursively collect the image files found under ``base_path``.

    Files are matched by extension (``.jpg``, ``.jpeg``, ``.png``),
    ignoring case so that names such as ``photo.JPG`` are included.

    Args:
        base_path: Directory to walk.

    Returns:
        A sorted list of file paths. Sorting makes the order independent of
        the order in which ``os.walk`` happens to list directory entries, so
        anything derived from this list lines up the same way on every run.
        The list is empty when nothing matches or the directory is missing.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    image_paths = []
    for root, _dirs, files in os.walk(base_path):
        for file_name in files:
            if file_name.lower().endswith(image_extensions):
                image_paths.append(os.path.join(root, file_name))
    image_paths.sort()
    return image_paths

def extract_features_from_dataset(image_paths, output_file="features.npy", batch_size=32):
    """Extract a ResNet-50 ``avgpool`` feature vector per image and save them.

    Args:
        image_paths: Sequence of image file paths to process, in order.
        output_file: Destination passed to ``np.save`` for the stacked
            features.
        batch_size: Number of images handled per progress-bar step. Images
            are still passed through the model one at a time.

    Returns:
        A list with one feature array per entry of ``image_paths``, in the
        same order. Nothing is written when ``image_paths`` is empty.
    """
    # Load the model and its preprocessing transform only once.
    model_weights = ResNet50_Weights.DEFAULT
    model = resnet50(weights=model_weights)
    model.eval()
    preprocess = model_weights.transforms()

    feature_extractor = create_feature_extractor(
        model, return_nodes={'avgpool': 'avgpool'}
    )

    features = []
    for start in tqdm(range(0, len(image_paths), batch_size), desc="Extracting feature vectors"):
        features.extend(
            extract_feature(image_path, feature_extractor, preprocess)
            for image_path in image_paths[start:start + batch_size]
        )

    # Write once at the end. Re-saving the whole accumulated array after
    # every batch rewrote an ever-growing file (quadratic disk I/O) and did
    # not reduce memory use, because the full list stays in memory anyway.
    if features:
        np.save(output_file, np.array(features))
    return features

In [None]:
# Gather every image under the Flickr8k image directory, then compute the
# feature vectors and store them in the Kaggle working directory.
image_paths = get_image_paths(
    "/kaggle/input/flickr8k/Images"
)
feature_vectors = extract_features_from_dataset(
    image_paths,
    output_file="/kaggle/working/features.npy",
)

In [None]:
#sentences format
sentence = List[str]


def extract_captions(datapath, *, skip_header: bool = False) -> Tuple[List[sentence], Dict[str, int]]:
    """Read a captions file and return tokenised captions plus word counts.

    Each non-blank line is expected to look like ``<image>,<caption>``. Only
    the first comma separates the two fields, so captions may themselves
    contain commas. Words are lower-cased and stripped of every non-word
    character; words that become empty (e.g. a lone ``.``) are dropped.

    Args:
        datapath: Path of the captions text file (read as UTF-8).
        skip_header: When true, the first non-blank line is treated as a
            column header and ignored.

    Returns:
        A pair ``(sentences, word_cnt)``: one token list per caption line,
        in file order, and a ``Counter`` mapping each token to its number of
        occurrences.

    Raises:
        ValueError: If a non-blank line contains no comma separator.
    """
    non_word = re.compile(r"\W+")  # compiled once, reused for every word
    sentences: List[sentence] = []
    word_cnt: Dict[str, int] = Counter()
    header_pending = skip_header

    with open(datapath, encoding="utf-8") as caption_file:
        for line_no, raw_line in enumerate(caption_file, start=1):
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                continue  # blank line, e.g. the trailing newline of the file
            if header_pending:
                header_pending = False
                continue

            # each (img, sentence) pair is on its own line
            _image, separator, caption = line.partition(",")
            if not separator:
                raise ValueError(
                    f"{datapath}:{line_no}: expected '<image>,<caption>', got {line!r}"
                )

            cleaned = (non_word.sub("", word.lower()) for word in caption.split())
            tokens = [token for token in cleaned if token]

            word_cnt.update(tokens)
            sentences.append(tokens)

    return sentences, word_cnt

In [None]:
class image_caption_dataset(Dataset):

    def __init__(
        self,
        datapath: str,
        word_vocab: Optional[Vocab] = None,
        feature_vectors: Any = None,
    ) -> None:
        """Load the captions and pair them with precomputed image features.

        Args:
            datapath: Path of the captions file passed to
                ``extract_captions``.
            word_vocab: Vocabulary to reuse. When ``None`` a new one is
                built from the word counts of ``datapath``.
            feature_vectors: Indexable collection of image feature vectors.
                Required; it only has a default because it follows an
                optional parameter in the signature.

        Raises:
            TypeError: If ``feature_vectors`` is not supplied.
        """
        super().__init__()
        if feature_vectors is None:
            raise TypeError("missing required argument: 'feature_vectors'")

        # The special tokens must be distinct and non-empty so that padding
        # and unknown words get separate indices. The angle brackets also
        # keep them from colliding with caption tokens, which contain word
        # characters only.
        PAD = "<pad>"
        UNKNOWN = "<unk>"

        self.sentences, word_cnt = extract_captions(datapath)
        self.feature_vectors = feature_vectors

        if word_vocab is None:
            word_vocab = vocab(word_cnt, specials=[PAD, UNKNOWN])
            # Out-of-vocabulary words map to the UNKNOWN index.
            word_vocab.set_default_index(word_vocab[UNKNOWN])

        self.word_vocab = word_vocab
        self.unknown_idx = self.word_vocab[UNKNOWN]
        self.pad_idx = self.word_vocab[PAD]

    def __getitem__(self, idx: int) -> Sentence:
        """
        Get the idx'th sentence in the dataset.
        """
        return self.sentences[idx], self.feature_vectors[idx]

    def __len__(self) -> int:
        """
        Return the number of sentences in the dataset.
        """
        return len(self.sentences)

    def form_batch(self, sentences: List[sentence], feature_vectors) --> Dict[str, Any]:
        
        word: List[List[str]] = []
        max_len = -1
        for sent in sentences:
            word.append([])
            for w in sent:
                word[-1].append(w)
            max_len = max(max_len, len(word[-1]))  

        batch_size  = len(sentences)

        #now we need to fill word_idxs, valid_masks and feature vectors tensor
        word_idxs = torch.full((batch_size, max_len), fill_value= self.pad_idx, dtype= torch.int640)
        valid_mask =torch.zeros_like(word_idxs, dtype= torch.bool)
        feature_vectors = torch.full(())