In [1]:
import torch 
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader,Dataset # custom datasets
from torchvision import transforms,datasets # mnist

In [2]:
import re
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np 
import os 
from PIL import Image
import matplotlib.pyplot as plt
from torchsummary import summary

In [4]:
# Display all rows
pd.set_option('display.max_rows', None)

# Display all columns
pd.set_option('display.max_columns', None)

# Set display width to avoid truncation
pd.set_option('display.width', None)

# Getting things setup

In [5]:
# check for gpu presence
device = "cuda" if torch.cuda.is_available() else "cpu"
name = torch.cuda.get_device_name(device=None)
print(f'cuda present: {device}\nname: {name}')

cuda present: cuda
name: NVIDIA GeForce MX330


In [6]:
# setting up the path for the data
# BASEDIR = 'C:\\Users\\naman\\Downloads\\archive\\flickr30k_images'
# data_path =  os.path.join(BASEDIR,'flickr30k_images')

In [7]:
# loading the directory of the trail dataset
TEST_DIR = 'C:\\Python course\\Major\\Trail_Dataset'

In [8]:
transform = transforms.Compose([
    transforms.Resize((600,600)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5],std=[0.5,0.5,0.5])
])

In [9]:
class Image_Data_Generator(Dataset):
    def __init__(self,directory,transform=None):
        self.directory = directory
        self.transform = transform
        self.image_files = [f for f in os.listdir(directory) if f.endswith(('.png','.jpg','.jpeg'))]
    
    def __len__(self):
        return len(self.image_files)

    def __getitem__(self,idx):
        self.image_path = os.path.join(self.directory,self.image_files[idx])
        image = Image.open(self.image_path)
        if self.transform:
            image = self.transform(image)
        return image,0

In [10]:
data = Image_Data_Generator(directory=TEST_DIR,transform=transform)

In [11]:
print(len(data))

21


In [12]:
data = DataLoader(data, batch_size=64, shuffle=True)

# APPLY RESNET 

In [13]:
resnet = models.resnet50(pretrained=True).to(device)
resnet = nn.Sequential(*(list(resnet.children())[:-1]))
summary(resnet,input_size=(3,224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [14]:
resnet.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = resnet(images)
        output = output.view(output.size(0),-1)
        features.append(output.cpu())

In [15]:
features

[tensor([[0.1475, 0.3853, 0.5458,  ..., 0.0289, 0.1857, 0.1674],
         [0.0934, 0.3153, 0.2459,  ..., 0.1266, 0.2304, 0.1550],
         [0.2236, 0.4763, 0.1889,  ..., 0.0886, 0.2014, 0.1247],
         ...,
         [0.0757, 0.3484, 0.1690,  ..., 0.0304, 0.0830, 0.0665],
         [0.1518, 0.1641, 0.7114,  ..., 0.0183, 0.1885, 0.1458],
         [0.2046, 0.2698, 0.2489,  ..., 0.0485, 0.3733, 0.1679]])]

# APPLY EFFICIENT NET

In [16]:
efficient_net = models.efficientnet_b7(pretrained=True).to(device)
efficient_net = nn.Sequential(*(list(efficient_net.children()))[-1])

In [17]:
efficient_net.eval()
features = []
with torch.no_grad():
    for images,_ in data:
        images = images.to(device)
        output = output.view(output.size(0),-1)
        features.append(output)

In [18]:
features

[tensor([[0.1475, 0.3853, 0.5458,  ..., 0.0289, 0.1857, 0.1674],
         [0.0934, 0.3153, 0.2459,  ..., 0.1266, 0.2304, 0.1550],
         [0.2236, 0.4763, 0.1889,  ..., 0.0886, 0.2014, 0.1247],
         ...,
         [0.0757, 0.3484, 0.1690,  ..., 0.0304, 0.0830, 0.0665],
         [0.1518, 0.1641, 0.7114,  ..., 0.0183, 0.1885, 0.1458],
         [0.2046, 0.2698, 0.2489,  ..., 0.0485, 0.3733, 0.1679]],
        device='cuda:0')]

# WORKING ON THE CAPTION DATA

In [23]:
content = {}
file = open("C:\\Python course\\Major\\result.txt","r")
all_text = file.readlines()

In [24]:
def clean_string(text):
    text = text.lower()
    text = re.sub('[\n\.]','',text)
    text = re.sub("[^a-z]+"," ",text)
    text = text.strip()
    return text

In [25]:
content_dictionary = {}
for text in all_text:
    text = text.split("|")
    if text[0].endswith('.jpg'):
        temp_list=[]
        if text[0][:-4] not in content_dictionary:
            clean_text = clean_string(text[-1]) 
            temp_list.append(clean_text)
            content_dictionary[text[0][:-4]] = temp_list
        else:
            clean_text = clean_string(text[-1])
            content_dictionary[text[0][:-4]].append(clean_text)

In [26]:
content_dictionary

{'1000092795': ['two young guys with shaggy hair look at their hands while hanging out in the yard',
  'two young white males are outside near many bushes',
  'two men in green shirts are standing in a yard',
  'a man in a blue shirt standing in a garden',
  'two friends enjoy time spent together'],
 '10002456': ['several men in hard hats are operating a giant pulley system',
  'workers look down from up above on a piece of equipment',
  'two men working on a machine wearing hard hats',
  'four men on top of a tall structure',
  'three men on a large rig'],
 '1000268201': ['a child in a pink dress is climbing up a set of stairs in an entry way',
  'a little girl in a pink dress going into a wooden cabin',
  'a little girl climbing the stairs to her playhouse',
  'a little girl climbing into a wooden playhouse',
  'a girl going into a wooden building'],
 '1000344755': ['someone in a blue shirt and hat is standing on stair and leaning against a window',
  'a man in a blue shirt is standi

In [28]:
print(clean_string("A man in green holds a guitar while the other man observes his shirt ."))


a man in green holds a guitar while the other man observes his shirt
