In [77]:
import torch 
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset # custom datasets
from torchvision import transforms,datasets # mnist

In [78]:
import re
import warnings
warnings.filterwarnings('ignore')

In [79]:
import pandas as pd
import numpy as np 
import os 
from PIL import Image
import matplotlib.pyplot as plt
from torchsummary import summary

In [80]:
# Display all rows
pd.set_option('display.max_rows', None)

# Display all columns
pd.set_option('display.max_columns', None)

# Set display width to avoid truncation
pd.set_option('display.width', None)

# Getting things setup

In [81]:
# check for gpu presence
device = "cuda" if torch.cuda.is_available() else "cpu"
name = torch.cuda.get_device_name(device=None)
print(f'cuda present: {device}\nname: {name}')

cuda present: cuda
name: NVIDIA GeForce MX330


In [82]:
# setting up the path for the data
# BASEDIR = 'C:\\Users\\naman\\Downloads\\archive\\flickr30k_images'
# data_path =  os.path.join(BASEDIR,'flickr30k_images')

In [83]:
# loading the directory of the trail dataset
TEST_DIR = 'C:\\Python course\\Major\\Trail_Dataset'

In [84]:
transform = transforms.Compose([
    transforms.Resize((600,600)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5],std=[0.5,0.5,0.5])
])

In [85]:
img_name  = []
class Image_Data_Generator(Dataset):
    def __init__(self,directory,transform=None):
        self.directory = directory
        self.transform = transform
        self.image_files = [f for f in os.listdir(directory) if f.endswith(('.png','.jpg','.jpeg'))]
        for f in self.image_files:
            f = f[:-4]
            if f not in img_name:
                img_name.append(f)
    
    def __len__(self):
        return len(self.image_files)

    def __getitem__(self,idx):
        self.image_path = os.path.join(self.directory,self.image_files[idx])
        image = Image.open(self.image_path)
        if self.transform:
            image = self.transform(image)
        return image,0

In [86]:
data = Image_Data_Generator(directory=TEST_DIR,transform=transform)

In [87]:
print(len(data))

21


In [88]:
data = DataLoader(data, batch_size=64, shuffle=True)

# APPLY RESNET 

In [89]:
resnet = models.resnet50(pretrained=True).to(device)
resnet = nn.Sequential(*(list(resnet.children())[:-1]))
summary(resnet,input_size=(3,224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [90]:
resnet.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = resnet(images)
        output = output.view(output.size(0),-1)
        features.append(output.cpu())
        

In [91]:
features

[tensor([[0.1023, 0.0851, 0.2017,  ..., 0.3507, 0.2399, 0.1335],
         [0.1768, 0.2959, 0.7434,  ..., 0.0290, 0.1907, 0.1949],
         [0.1676, 0.4160, 0.7674,  ..., 0.1542, 0.2161, 0.2272],
         ...,
         [0.1705, 0.4698, 0.5936,  ..., 0.1496, 0.1899, 0.0839],
         [0.2335, 0.1446, 1.1984,  ..., 0.0966, 0.2206, 0.1431],
         [0.1876, 0.5097, 0.4374,  ..., 0.0830, 0.1382, 0.2505]])]

# APPLY EFFICIENT NET

In [92]:
efficient_net = models.efficientnet_b7(pretrained=True).to(device)
efficient_net = nn.Sequential(*(list(efficient_net.children()))[-1])

In [93]:
efficient_net.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = output.view(output.size(0),-1)
        features.append(output.cpu())

In [94]:
features[0]

tensor([[0.1023, 0.0851, 0.2017,  ..., 0.3507, 0.2399, 0.1335],
        [0.1768, 0.2959, 0.7434,  ..., 0.0290, 0.1907, 0.1949],
        [0.1676, 0.4160, 0.7674,  ..., 0.1542, 0.2161, 0.2272],
        ...,
        [0.1705, 0.4698, 0.5936,  ..., 0.1496, 0.1899, 0.0839],
        [0.2335, 0.1446, 1.1984,  ..., 0.0966, 0.2206, 0.1431],
        [0.1876, 0.5097, 0.4374,  ..., 0.0830, 0.1382, 0.2505]])

In [95]:
img_ids = img_name

image_features = {img_id : features[0][i] for i, img_id in enumerate(img_ids)}

In [96]:
for i,feat in image_features.items():
    print(f'Image_ID:{i} and features: {feat}')

Image_ID:985982384 and features: tensor([0.1023, 0.0851, 0.2017,  ..., 0.3507, 0.2399, 0.1335])
Image_ID:986127455 and features: tensor([0.1768, 0.2959, 0.7434,  ..., 0.0290, 0.1907, 0.1949])
Image_ID:986440271 and features: tensor([0.1676, 0.4160, 0.7674,  ..., 0.1542, 0.2161, 0.2272])
Image_ID:987442144 and features: tensor([0.1674, 0.5808, 0.3514,  ..., 0.0687, 0.0309, 0.2093])
Image_ID:98756067 and features: tensor([0.1296, 0.4659, 0.6036,  ..., 0.0456, 0.1301, 0.2771])
Image_ID:98756125 and features: tensor([0.2528, 0.6382, 0.5387,  ..., 0.0496, 0.0828, 0.1820])
Image_ID:98773047 and features: tensor([0.2080, 0.2753, 0.2509,  ..., 0.0487, 0.3861, 0.1570])
Image_ID:987907964 and features: tensor([0.1024, 0.9110, 0.3357,  ..., 0.0902, 0.1611, 0.1709])
Image_ID:98817947 and features: tensor([0.1434, 0.1556, 0.6955,  ..., 0.0188, 0.1985, 0.1387])
Image_ID:98885561 and features: tensor([0.1476, 0.3742, 0.5345,  ..., 0.0282, 0.2009, 0.1559])
Image_ID:98944492 and features: tensor([0.090

# WORKING ON THE CAPTION DATA

In [97]:
content = {}
file = open("C:\\Python course\\Major\\result.txt","r")
all_text = file.readlines()

In [98]:
def clean_string(text):
    text = text.lower()
    text = re.sub('[\n\.]','',text)
    text = re.sub("[^a-z]+"," ",text)
    text = text.strip()
    return text

In [99]:
content_dictionary = {}
for text in all_text:
    text = text.split("|")
    if text[0].endswith('.jpg'):
        temp_list=[]
        if text[0][:-4] not in content_dictionary:
            clean_text = clean_string(text[-1]) 
            temp_list.append(clean_text)
            content_dictionary[text[0][:-4]] = temp_list
        else:
            clean_text = clean_string(text[-1])
            content_dictionary[text[0][:-4]].append(clean_text)

In [100]:
content_dictionary = {img_id:content for img_id,content in content_dictionary.items() if img_id in img_ids}

In [101]:
content_dictionary

{'985982384': ['some women and men are sun tanning and watching the ocean waves on a bunch of rocks',
  'two women relaxing while two men have a conversation on a rock',
  'a group of sunbathers lies on the rocks on towels and blankets',
  'men and women in swimsuits hangout on rocks above water',
  'group of sunbathers laying on the rocks'],
 '986127455': ['man walking down the street is wearing a black suit and carrying a small white bag',
  'a person with a bag walking in a big city',
  'people walking a standing in a city park',
  'a woman in a black suit is walking by',
  'a person is walking with a white bag'],
 '986440271': ['a young woman in a red flowered dress and a multicolored umbrella is taking a picture of something and there is a large tree in the background',
  'a woman in a red floral dress with cardigan takes a photo on a nature trail while holding a rainbow umbrella',
  'a woman with a multicolored umbrella takes a picture on a road surrounded by trees',
  'a woman w

In [103]:
'''
    Finally we have 2 dictionary 
    image_features --> this has your image features in 2048 vector
    content_dictionary ---> this has your image caption data
'''

'\n    Finally we have 2 dictionary \n    image_features --> this has your image features in 2048 vector\n    content_dictionary ---> this has your image caption data\n'

# TIME TO BUILD A LSTM FOR CAPTION GENERATION

In [105]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense,LSTM
from tensorflow.keras.models import Sequential