In [6]:
import torch 
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset # custom datasets
from torchvision import transforms,datasets # mnist
from torchsummary import summary

In [4]:
import pandas as pd
import numpy as np 
import os 
from PIL import Image
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')
import collections
import pickle
from tqdm import tqdm

In [2]:
from tensorflow.keras.utils import pad_sequences,to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
# Display all rows
pd.set_option('display.max_rows', None)

# Display all columns
pd.set_option('display.max_columns', None)

# Set display width to avoid truncation
pd.set_option('display.width', None)

# Getting things setup

In [7]:
# check for gpu presence
device = "cuda" if torch.cuda.is_available() else "cpu"
name = torch.cuda.get_device_name(device=None)
print(f'cuda present: {device}\nname: {name}')

cuda present: cuda
name: NVIDIA GeForce MX330


In [8]:
# setting up the path for the data
# BASEDIR = 'C:\\Users\\naman\\Downloads\\archive\\flickr30k_images'
# data_path =  os.path.join(BASEDIR,'flickr30k_images')

In [9]:
# loading the directory of the trail dataset
TEST_DIR = 'C:\\Python course\\Major\\Trail_Dataset'

In [21]:
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5],std=[0.5,0.5,0.5])
])

In [49]:
img_name  = []
class Image_Data_Generator(Dataset):
    def __init__(self,directory,transform=None):
        self.directory = directory
        self.transform = transform
        self.image_files = [f for f in os.listdir(directory) if f.endswith(('.png','.jpg','.jpeg'))]
        for f in self.image_files:
            f = f[:-4]
            if f not in img_name:
                img_name.append(f)
    
    def __len__(self):
        return len(self.image_files)

    def __getitem__(self,idx):
        self.image_path = os.path.join(self.directory,self.image_files[idx])
        image = Image.open(self.image_path)
        if self.transform:
            image = self.transform(image)
        return image,0

In [52]:
data = Image_Data_Generator(directory=TEST_DIR,transform=transform)

In [53]:
print(len(data))

299


In [54]:
data = DataLoader(data, batch_size=300)

# APPLY RESNET 

In [55]:
resnet = models.resnet50(pretrained=True).to(device)
resnet = nn.Sequential(*(list(resnet.children())[:-1]))
summary(resnet,input_size=(3,224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [56]:
resnet.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = resnet(images)
        output = output.view(output.size(0),-1)
        features.append(output.cpu())
        

In [64]:
features[0][0]

tensor([0.6545, 0.3312, 0.2800,  ..., 0.3515, 0.5427, 0.5932])

# APPLY EFFICIENT NET

In [58]:
# efficient_net = models.efficientnet_b7(pretrained=True).to(device)
# efficient_net = nn.Sequential(*(list(efficient_net.children()))[-1])

In [59]:
# efficient_net.eval()
# features = []
# with torch.no_grad():
#     for images,_ in data:
#         images = images.to(device)
#         output = efficient_net(images)
#         output = output.view(output.size(0),-1)
#         features.append(output.cpu())

In [60]:
# features[0]

In [65]:
img_ids = img_name

image_features = {img_id : features[0][i] for i, img_id in enumerate(img_ids)}

In [66]:
for i,feat in image_features.items():
    print(f'Image_ID:{i} and features: {feat}')

Image_ID:872135364 and features: tensor([0.6545, 0.3312, 0.2800,  ..., 0.3515, 0.5427, 0.5932])
Image_ID:872512911 and features: tensor([0.3645, 0.4464, 0.8144,  ..., 0.1322, 0.1226, 0.5954])
Image_ID:872615435 and features: tensor([0.4927, 1.3444, 0.4861,  ..., 0.0331, 0.8065, 0.3060])
Image_ID:872622575 and features: tensor([0.3444, 0.5633, 0.2221,  ..., 0.4542, 0.3199, 0.3661])
Image_ID:873633312 and features: tensor([0.0311, 0.2591, 0.9242,  ..., 0.0113, 0.1411, 0.8619])
Image_ID:873650807 and features: tensor([0.7056, 0.7908, 0.0271,  ..., 0.0160, 0.0819, 0.0854])
Image_ID:873862583 and features: tensor([0.3250, 0.4158, 0.2097,  ..., 0.4070, 0.0729, 0.2421])
Image_ID:87388323 and features: tensor([0.6980, 0.5433, 1.1830,  ..., 0.0593, 0.9855, 0.7441])
Image_ID:873933926 and features: tensor([0.4178, 0.9004, 0.2047,  ..., 0.0322, 0.2809, 0.4885])
Image_ID:874665322 and features: tensor([0.6357, 0.3415, 0.3059,  ..., 0.3191, 0.6704, 0.4302])
Image_ID:874736612 and features: tensor([

In [67]:
#store the features in the pickl file
pickle.dump(features,open(os.path.join(os.getcwd(),'features.pkl'),'wb'))

with open(os.path.join(os.getcwd(),'features.pkl'),'rb') as f:
    features = pickle.load(f)

# WORKING ON THE CAPTION DATA

In [68]:
content = {}
file = open("C:\\Python course\\Major\\result.txt","r")
all_text = file.readlines()

In [69]:
def clean_string(text):
    text = text.lower()
    text = re.sub('[\n\.]','',text)
    text = re.sub("[^A-Za-z]+"," ",text)
    text = '<start> '+ ' '.join([word for word in text.split() if len(word)>1]) +' <end>'
    text = text.strip()
    return text

In [70]:
content_dictionary = {}
for text in all_text:
    text = text.split("|")
    if text[0].endswith('.jpg'):
        temp_list=[]
        if text[0][:-4] not in content_dictionary:
            clean_text = clean_string(text[-1]) 
            temp_list.append(clean_text)
            content_dictionary[text[0][:-4]] = temp_list
        else:
            clean_text = clean_string(text[-1])
            content_dictionary[text[0][:-4]].append(clean_text)

In [71]:
content_dictionary = {img_id:content for img_id,content in content_dictionary.items() if img_id in img_ids}

In [72]:
content_dictionary

{'872135364': ['<start> big gray dog wearing chain collar has smaller brown dog pinned down <end>',
  '<start> the mastif playfully pins the german shepherd in the grass <end>',
  '<start> black dog stands on top of brown dog on the grass <end>',
  '<start> two dogs are wrestling in grassy field <end>',
  '<start> brown and black dog tussle on grass <end>'],
 '872512911': ['<start> large group of people most setting down on walkway with two boys standing up holding plates <end>',
  '<start> two men one with purple hair at crowded fountain near seattle space needle <end>',
  '<start> people sit by fountain guy with purple hair talks to guy in hat <end>',
  '<start> many people are sitting and observing or going into water attraction <end>',
  '<start> many people sit or stand around the fountain near the space needle <end>'],
 '872615435': ['<start> teen girl in green plaid shirt with white shirt underneath and jeans is jumping in the air <end>',
  '<start> an awesome picture of young w

In [73]:
all_cap = []
for i in content_dictionary:
    for cap in content_dictionary[i]:
        all_cap.append(cap)

print(len(all_cap))

1495


In [74]:
total_word = []
for i,word in content_dictionary.items():
    for w in word:
        for j in w.split():
            total_word.append(j)

In [75]:
frequency_data = dict(collections.Counter(total_word))
print(frequency_data)

{'<start>': 1495, 'big': 11, 'gray': 18, 'dog': 89, 'wearing': 130, 'chain': 1, 'collar': 6, 'has': 23, 'smaller': 1, 'brown': 42, 'pinned': 1, 'down': 65, '<end>': 1495, 'the': 587, 'mastif': 1, 'playfully': 1, 'pins': 1, 'german': 3, 'shepherd': 3, 'in': 718, 'grass': 24, 'black': 98, 'stands': 25, 'on': 403, 'top': 25, 'of': 365, 'two': 185, 'dogs': 12, 'are': 172, 'wrestling': 3, 'grassy': 6, 'field': 20, 'and': 397, 'tussle': 1, 'large': 51, 'group': 79, 'people': 148, 'most': 1, 'setting': 5, 'walkway': 4, 'with': 339, 'boys': 17, 'standing': 83, 'up': 43, 'holding': 41, 'plates': 2, 'men': 95, 'one': 31, 'purple': 8, 'hair': 26, 'at': 153, 'crowded': 8, 'fountain': 3, 'near': 36, 'seattle': 1, 'space': 2, 'needle': 3, 'sit': 25, 'by': 54, 'guy': 16, 'talks': 5, 'to': 126, 'hat': 42, 'many': 10, 'sitting': 87, 'observing': 4, 'or': 11, 'going': 5, 'into': 48, 'water': 89, 'attraction': 1, 'stand': 27, 'around': 28, 'teen': 1, 'girl': 88, 'green': 41, 'plaid': 5, 'shirt': 135, 'wh

In [76]:
'''
    Finally we have 2 dictionary 
    image_features --> this has your image features in 2048 vector
    content_dictionary ---> this has your image caption data
'''

'\n    Finally we have 2 dictionary \n    image_features --> this has your image features in 2048 vector\n    content_dictionary ---> this has your image caption data\n'

In [77]:
# TOKENIZE the test
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_cap)
vocab_size = len(tokenizer.word_index)+1
print(vocab_size)

2085


In [78]:
# getting the maximum length of the caption --  will we required during model building
max_length = max(len(cap.split()) for cap in all_cap)
max_length

38

# Train Test Split

In [79]:
split_ratio = int(len(img_ids)*0.9)
train = img_ids[:split_ratio]
test = img_ids[split_ratio:]

The following function is CHATGPT, CLAUDE AND LITTLE BIT OF DEEPSEEK so, don't ask much about it. 
I do have a basic idea how this works

In [80]:
import tensorflow as tf
import numpy as np

def datagenerator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    def generator():
        X1, X2, y = [], [], []
        n = 0
        while True:
            for key in data_keys:
                captions = mapping[key]
                for caption in captions:
                    seq = tokenizer.texts_to_sequences([caption])[0]
                    for i in range(1, len(seq)):
                        in_seq, out_seq = seq[:i], seq[i]
                        in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                        
                        # Convert tensor to numpy array if needed
                        feature = features[key].numpy() if hasattr(features[key], 'numpy') else features[key]
                        feature = feature.reshape(1, -1)  # Reshape to (1, 2048)
                        
                        X1.append(feature)
                        X2.append(in_seq)
                        y.append(out_seq)
                        n += 1
                        
                        if n == batch_size:
                            # Stack the batches
                            X1_batch = np.vstack(X1)  # Shape: (batch_size, 2048)
                            X2_batch = np.array(X2)   # Shape: (batch_size, max_length)
                            y_batch = np.array(y)     # Shape: (batch_size, vocab_size)
                            
                            # Reset lists and counter
                            X1, X2, y = [], [], []
                            n = 0
                            
                            yield (X1_batch, X2_batch), y_batch
    
    # Define output signature
    output_signature = (
        (
            tf.TensorSpec(shape=(None, 2048), dtype=tf.float32),
            tf.TensorSpec(shape=(None, max_length), dtype=tf.int32)
        ),
        tf.TensorSpec(shape=(None, vocab_size), dtype=tf.float32)
    )
    
    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=output_signature
    )
    
    return dataset

#  LSTM FOR CAPTION GENERATION

In [81]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense,LSTM,Dropout,Embedding,add
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam

In [82]:
# Encoder model 
# Image Feature layer
input1 = Input(shape=(2048,))
fe1 = Dropout(0.4)(input1)
fe2 = Dense(256, activation='relu')(fe1)
# sequence feature layer
input2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size,256,mask_zero=True)(input2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# Decoder model
decoder1 = add([fe2,se3])
decoder2 = Dense(256,activation='relu')(decoder1)
outputs = Dense(vocab_size,activation='softmax')(decoder2)

model = Model([input1,input2],outputs = outputs)
model.compile(loss = 'categorical_crossentropy', optimizer=Adam())
model.summary()

In [84]:
# Train the model
epoch = 10
batch_size = 2
steps = len(train) // batch_size

dataset = datagenerator(
    data_keys=img_ids,
    mapping=content_dictionary,
    features=image_features,
    tokenizer=tokenizer,
    max_length=max_length,
    vocab_size=vocab_size,
    batch_size=batch_size
)

for (x1_batch, x2_batch), y_batch in dataset.take(1):
    print("X1 batch shape:", x1_batch.shape)
    print("X2 batch shape:", x2_batch.shape)
    print("y batch shape:", y_batch.shape)

for i in range(epoch):
    model.fit(dataset, epochs=1, steps_per_epoch=steps,verbose=1)

X1 batch shape: (2, 2048)
X2 batch shape: (2, 38)
y batch shape: (2, 2085)
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 48ms/step - loss: 6.9619
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - loss: 4.4347
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 51ms/step - loss: 4.3721
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - loss: 4.4703
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step - loss: 4.2024
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 4.3656
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - loss: 3.9729
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - loss: 3.6514
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 3.2882
[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - loss: 3.0016


In [85]:
model.save('best_model.h5')



# TESTING THE MODEL

In [86]:
def idx_to_word(integer,tokenizer):
    for word, index in tokenizer.word_index.items():
        if index==integer:
            return word
        return None

In [87]:
#generate caption for an image
def predict_caption_off(model,image,tokenizer,max_length):
    if not isinstance(image, np.ndarray):
        image = image.numpy()
    # image = image.reshape(1, -1)
    image = np.expand_dims(image, axis=0)
    in_text = '<start>'
    for i in range(max_length):
        # encode input sequence 
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        # pad the sequence
        sequence = pad_sequences([sequence],max_length)
        # predict the next word
        yhat = model.predict([image,sequence],verbose=0)
        # convert or get index with high probability
        yhat = np.argmax(yhat)
        # convert index to word
        word = idx_to_word(yhat,tokenizer)
        if word is None:
            break
        # append
        in_text += " " + word
        if word == '<end>':
            break
    return in_text

In [88]:
# validate with test data
actual, predicted = [], []

for key in tqdm(test):
    captions = content_dictionary[key]
    y_pred = predict_caption_off(model,image_features[key],tokenizer,max_length)
    # actual_caption = [caption.split() for caption in captions]
    # y_pred=y_pred.split()
    # actual.append(actual_caption)
    # predicted.append(y_pred)
    actual_captions = []
    for caption in captions:
        # Clean the caption
        cleaned_caption = caption.replace('<start>', '').replace('<end>', '').strip()
        actual_captions.append(cleaned_caption.split())
    
    # Clean predicted caption
    pred_caption = y_pred.replace('<start>', '').replace('<end>', '').strip().split()
    
    actual.append(actual_captions)
    predicted.append(pred_caption)



100%|██████████| 30/30 [00:02<00:00, 12.01it/s]


In [89]:
predicted

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]