In [2]:
import torch 
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset # custom datasets
from torchvision import transforms,datasets # mnist

In [39]:
import re
import warnings
warnings.filterwarnings('ignore')
import collections

In [4]:
import pandas as pd
import numpy as np 
import os 
from PIL import Image
import matplotlib.pyplot as plt
from torchsummary import summary

In [5]:
# Display all rows
pd.set_option('display.max_rows', None)

# Display all columns
pd.set_option('display.max_columns', None)

# Set display width to avoid truncation
pd.set_option('display.width', None)

# Getting things setup

In [6]:
# check for gpu presence
device = "cuda" if torch.cuda.is_available() else "cpu"
name = torch.cuda.get_device_name(device=None)
print(f'cuda present: {device}\nname: {name}')

cuda present: cuda
name: NVIDIA GeForce MX330


In [7]:
# setting up the path for the data
# BASEDIR = 'C:\\Users\\naman\\Downloads\\archive\\flickr30k_images'
# data_path =  os.path.join(BASEDIR,'flickr30k_images')

In [8]:
# loading the directory of the trail dataset
TEST_DIR = 'C:\\Python course\\Major\\Trail_Dataset'

In [9]:
transform = transforms.Compose([
    transforms.Resize((600,600)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5],std=[0.5,0.5,0.5])
])

In [10]:
img_name  = []
class Image_Data_Generator(Dataset):
    def __init__(self,directory,transform=None):
        self.directory = directory
        self.transform = transform
        self.image_files = [f for f in os.listdir(directory) if f.endswith(('.png','.jpg','.jpeg'))]
        for f in self.image_files:
            f = f[:-4]
            if f not in img_name:
                img_name.append(f)
    
    def __len__(self):
        return len(self.image_files)

    def __getitem__(self,idx):
        self.image_path = os.path.join(self.directory,self.image_files[idx])
        image = Image.open(self.image_path)
        if self.transform:
            image = self.transform(image)
        return image,0

In [11]:
data = Image_Data_Generator(directory=TEST_DIR,transform=transform)

In [12]:
print(len(data))

21


In [13]:
data = DataLoader(data, batch_size=64, shuffle=True)

# APPLY RESNET 

In [14]:
resnet = models.resnet50(pretrained=True).to(device)
resnet = nn.Sequential(*(list(resnet.children())[:-1]))
summary(resnet,input_size=(3,224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [15]:
resnet.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = resnet(images)
        output = output.view(output.size(0),-1)
        features.append(output.cpu())
        

In [16]:
features

[tensor([[0.1053, 0.9231, 0.3318,  ..., 0.0904, 0.1563, 0.1781],
         [0.2849, 0.4690, 0.5033,  ..., 0.1356, 0.2589, 0.3747],
         [0.1355, 0.1182, 0.3567,  ..., 0.0315, 0.0560, 0.1500],
         ...,
         [0.1240, 0.5719, 0.1252,  ..., 0.0340, 0.0432, 0.1434],
         [0.1783, 0.5770, 0.3486,  ..., 0.0658, 0.0317, 0.2162],
         [0.0741, 0.3467, 0.1679,  ..., 0.0286, 0.0853, 0.0655]])]

# APPLY EFFICIENT NET

In [17]:
efficient_net = models.efficientnet_b7(pretrained=True).to(device)
efficient_net = nn.Sequential(*(list(efficient_net.children()))[-1])

In [18]:
efficient_net.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = output.view(output.size(0),-1)
        features.append(output.cpu())

In [19]:
features[0]

tensor([[0.1053, 0.9231, 0.3318,  ..., 0.0904, 0.1563, 0.1781],
        [0.2849, 0.4690, 0.5033,  ..., 0.1356, 0.2589, 0.3747],
        [0.1355, 0.1182, 0.3567,  ..., 0.0315, 0.0560, 0.1500],
        ...,
        [0.1240, 0.5719, 0.1252,  ..., 0.0340, 0.0432, 0.1434],
        [0.1783, 0.5770, 0.3486,  ..., 0.0658, 0.0317, 0.2162],
        [0.0741, 0.3467, 0.1679,  ..., 0.0286, 0.0853, 0.0655]])

In [20]:
img_ids = img_name

image_features = {img_id : features[0][i] for i, img_id in enumerate(img_ids)}

In [21]:
for i,feat in image_features.items():
    print(f'Image_ID:{i} and features: {feat}')

Image_ID:985982384 and features: tensor([0.1053, 0.9231, 0.3318,  ..., 0.0904, 0.1563, 0.1781])
Image_ID:986127455 and features: tensor([0.2849, 0.4690, 0.5033,  ..., 0.1356, 0.2589, 0.3747])
Image_ID:986440271 and features: tensor([0.1355, 0.1182, 0.3567,  ..., 0.0315, 0.0560, 0.1500])
Image_ID:987442144 and features: tensor([0.2363, 0.1451, 1.1998,  ..., 0.1033, 0.2156, 0.1499])
Image_ID:98756067 and features: tensor([0.1110, 0.3214, 0.0895,  ..., 0.0908, 0.1907, 0.2727])
Image_ID:98756125 and features: tensor([0.1387, 0.2314, 0.1611,  ..., 0.0317, 0.0471, 0.1555])
Image_ID:98773047 and features: tensor([0.2167, 0.4588, 0.1797,  ..., 0.0921, 0.2046, 0.1184])
Image_ID:987907964 and features: tensor([0.1083, 0.0819, 0.2008,  ..., 0.3558, 0.2337, 0.1420])
Image_ID:98817947 and features: tensor([0.1727, 0.4189, 0.7477,  ..., 0.1516, 0.2114, 0.2324])
Image_ID:98885561 and features: tensor([0.2566, 0.6387, 0.5218,  ..., 0.0476, 0.0831, 0.1915])
Image_ID:98944492 and features: tensor([0.142

# WORKING ON THE CAPTION DATA

In [22]:
content = {}
file = open("C:\\Python course\\Major\\result.txt","r")
all_text = file.readlines()

In [23]:
def clean_string(text):
    text = text.lower()
    text = re.sub('[\n\.]','',text)
    text = re.sub("[^a-z]+"," ",text)
    text = text.strip()
    return text

In [24]:
content_dictionary = {}
for text in all_text:
    text = text.split("|")
    if text[0].endswith('.jpg'):
        temp_list=[]
        if text[0][:-4] not in content_dictionary:
            clean_text = clean_string(text[-1]) 
            temp_list.append(clean_text)
            content_dictionary[text[0][:-4]] = temp_list
        else:
            clean_text = clean_string(text[-1])
            content_dictionary[text[0][:-4]].append(clean_text)

In [25]:
content_dictionary = {img_id:content for img_id,content in content_dictionary.items() if img_id in img_ids}

In [26]:
content_dictionary

{'985982384': ['some women and men are sun tanning and watching the ocean waves on a bunch of rocks',
  'two women relaxing while two men have a conversation on a rock',
  'a group of sunbathers lies on the rocks on towels and blankets',
  'men and women in swimsuits hangout on rocks above water',
  'group of sunbathers laying on the rocks'],
 '986127455': ['man walking down the street is wearing a black suit and carrying a small white bag',
  'a person with a bag walking in a big city',
  'people walking a standing in a city park',
  'a woman in a black suit is walking by',
  'a person is walking with a white bag'],
 '986440271': ['a young woman in a red flowered dress and a multicolored umbrella is taking a picture of something and there is a large tree in the background',
  'a woman in a red floral dress with cardigan takes a photo on a nature trail while holding a rainbow umbrella',
  'a woman with a multicolored umbrella takes a picture on a road surrounded by trees',
  'a woman w

In [37]:
total_word = []
for i,word in content_dictionary.items():
    for w in word:
        for j in w.split():
            total_word.append(j)
            

In [43]:
frequency_data = dict(collections.Counter(total_word))
print(frequency_data)

{'some': 2, 'women': 3, 'and': 30, 'men': 11, 'are': 17, 'sun': 1, 'tanning': 1, 'watching': 1, 'the': 45, 'ocean': 4, 'waves': 2, 'on': 41, 'a': 166, 'bunch': 1, 'of': 23, 'rocks': 4, 'two': 12, 'relaxing': 2, 'while': 5, 'have': 1, 'conversation': 1, 'rock': 1, 'group': 7, 'sunbathers': 2, 'lies': 1, 'towels': 1, 'blankets': 1, 'in': 51, 'swimsuits': 1, 'hangout': 1, 'above': 2, 'water': 3, 'laying': 1, 'man': 23, 'walking': 9, 'down': 8, 'street': 7, 'is': 24, 'wearing': 7, 'black': 9, 'suit': 2, 'carrying': 3, 'small': 2, 'white': 8, 'bag': 3, 'person': 10, 'with': 25, 'big': 1, 'city': 4, 'people': 14, 'standing': 8, 'park': 2, 'woman': 16, 'by': 5, 'young': 6, 'red': 4, 'flowered': 1, 'dress': 2, 'multicolored': 2, 'umbrella': 5, 'taking': 3, 'picture': 5, 'something': 1, 'there': 2, 'large': 3, 'tree': 1, 'background': 1, 'floral': 1, 'cardigan': 1, 'takes': 4, 'photo': 1, 'nature': 1, 'trail': 1, 'holding': 2, 'rainbow': 2, 'road': 2, 'surrounded': 3, 'trees': 2, 'sanding': 1, 

In [103]:
'''
    Finally we have 2 dictionary 
    image_features --> this has your image features in 2048 vector
    content_dictionary ---> this has your image caption data
'''

'\n    Finally we have 2 dictionary \n    image_features --> this has your image features in 2048 vector\n    content_dictionary ---> this has your image caption data\n'

# TIME TO BUILD A LSTM FOR CAPTION GENERATION

In [105]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense,LSTM
from tensorflow.keras.models import Sequential