# **Embedding Extraction**

In this notebook, we extract the image embeddings and store them in a pandas dataframe so our application can use them.

In [None]:
import pandas as pd
import torch.nn as nn
import torch
import os
import torchvision.transforms as transforms

from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights


from google.colab import drive
# Mount Google Drive at /content/drive; every data and checkpoint path used
# in this notebook lives under that mount point.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(42)

<torch._C.Generator at 0x793b47f58350>

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Caption table: a "|"-separated text file read into a dataframe.
metadata = pd.read_csv(
    "/content/drive/MyDrive/Image captioning/captions.txt",
    sep="|",
)

# Positional, contiguous split: the first 4045 rows are the test set, the next
# 4045 rows the validation set, and everything after that the training set.
# NOTE(review): presumably 5 caption rows per image (809 images per held-out
# split) -- confirm against the captions file.
metadata_test, metadata_val, metadata_train = (
    metadata.iloc[:4045],
    metadata.iloc[4045:8090],
    metadata.iloc[8090:],
)

In [None]:
class ImageDataset(Dataset):
    """In-memory dataset of preprocessed images, one item per unique image name.

    Every distinct value of ``metadata["image_name"]`` is opened from
    ``IMAGE_DIR``, resized, center-cropped to 224x224, converted to a tensor,
    normalized, and stacked into ``self.images``. Items are
    ``(image_tensor, image_name)`` pairs.

    Args:
        metadata: DataFrame with an ``"image_name"`` column. Only the rows of
            the split this dataset represents should be passed in.
    """

    IMAGE_DIR = "/content/drive/MyDrive/Image captioning/archive/images"

    def __init__(self, metadata):
        super().__init__()
        self.metadata = metadata
        # Unique names in order of first appearance; self.images follows the
        # same order, so position i in both refers to the same image.
        self.image_names = list(metadata["image_name"].unique())

        # Build the preprocessing pipeline once instead of once per image.
        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

        images = []
        for image_name in tqdm(self.image_names):
            # The context manager closes the file handle; convert("RGB")
            # guarantees three channels so the 3-channel Normalize and the
            # final stack cannot fail on grayscale or RGBA files.
            with Image.open(os.path.join(self.IMAGE_DIR, image_name)) as img:
                images.append(transform(img.convert("RGB")))

        print(len(images))
        self.images = torch.stack(images)
        print(self.images.size())

    def _names(self):
        """Return the unique image names aligned with ``self.images``."""
        names = self.__dict__.get("image_names")
        if names is None:
            # Instances pickled by an earlier version of this class have no
            # cached list; rebuild it from the stored metadata on first use.
            names = list(self.metadata["image_name"].unique())
            self.image_names = names
        return names

    def __len__(self):
        """Number of images held by the dataset."""
        return self.images.size()[0]

    def __getitem__(self, idx):
        """Return ``(image_tensor, image_name)`` for position ``idx``.

        The name comes from this dataset's own metadata, not from a global
        table, so each split reports the names of its own images.
        """
        return (self.images[idx], self._names()[idx])

In [None]:
# Restore the previously saved datasets for the three splits.
# weights_only=False performs a full unpickle (arbitrary Python objects), so
# these files must come from a trusted source.
# NOTE(review): presumably these are pickled ImageDataset instances, in which
# case the class must already be defined in this session -- confirm.
dataset_test, dataset_val, dataset_train = [
    torch.load(saved_path, weights_only=False)
    for saved_path in (
        "/content/drive/MyDrive/Image captioning/saved_data/dataset_test.pt",
        "/content/drive/MyDrive/Image captioning/saved_data/dataset_val.pt",
        "/content/drive/MyDrive/Image captioning/saved_data/dataset_train.pt",
    )
]

In [None]:
# One loader per split; a batch size of 1 and no shuffling yield the items
# one at a time in dataset order.
dataloader_train, dataloader_val, dataloader_test = [
    DataLoader(split_dataset, batch_size=1, shuffle=False)
    for split_dataset in (dataset_train, dataset_val, dataset_test)
]

In [None]:
class ImageEncoder(nn.Module):
    """EfficientNet-B0 backbone followed by a linear projection.

    The backbone's classifier is replaced by an identity layer, and its
    output is projected by ``fc1`` from 1280 features to ``embed_size``.

    Args:
        embed_size: Size of the embedding produced for each image.
    """

    def __init__(self, embed_size):
        super().__init__()
        backbone = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
        # Drop the classification head so the backbone returns its features.
        backbone.classifier = nn.Identity()
        self.efficient_net = backbone
        self.fc1 = nn.Linear(in_features=1280, out_features=embed_size)

    def forward(self, images):
        """Return the ``fc1`` projection of the backbone features of ``images``."""
        return self.fc1(self.efficient_net(images))

In [None]:
image_encoder = ImageEncoder(embed_size=256).to(device)
# map_location remaps the checkpoint tensors onto the active device, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict needs, and silences the torch.load FutureWarning.
image_encoder.load_state_dict(
    torch.load(
        "/content/drive/MyDrive/Image captioning/saved_data/image_encoder_epoch_50.pt",
        map_location=device,
        weights_only=True,
    )
)
# Switch to inference mode (fixed batch-norm statistics, dropout disabled).
image_encoder.eval()

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 62.4MB/s]
  image_encoder.load_state_dict(torch.load("/content/drive/MyDrive/Image captioning/saved_data/image_encoder_epoch_50.pt"))


ImageEncoder(
  (efficient_net): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (

Here, we fill a dataframe with the image embeddings.

In [None]:
embeddings_dataframe = pd.DataFrame(columns=["image_name", "embedding"])


def fill_dataframe(dataloader):
    """Append one ``(image_name, embedding)`` row per image to ``embeddings_dataframe``.

    Each batch is moved to ``device`` and encoded with ``image_encoder``; the
    resulting embeddings are stored as NumPy arrays on the CPU.

    Args:
        dataloader: Iterable yielding ``(images, names)`` batches, where
            ``names`` holds one image name per image in the batch. Any batch
            size is supported.

    Side effects:
        Rebinds the module-level ``embeddings_dataframe`` to a dataframe that
        contains the previous rows followed by the new ones.
    """
    global embeddings_dataframe

    records = []
    # Inference only: no_grad avoids building an autograd graph per batch.
    with torch.no_grad():
        for images, names in tqdm(dataloader):
            embeddings = image_encoder(images.to(device)).cpu().numpy()
            # Pair every name with its own embedding row rather than taking
            # names[0] and squeezing, which is only right for batch_size=1.
            records.extend(
                {"image_name": name, "embedding": embedding}
                for name, embedding in zip(names, embeddings)
            )

    if not records:
        return

    # Build the new rows in one go instead of growing the frame row by row.
    new_rows = pd.DataFrame(records, columns=["image_name", "embedding"])
    if embeddings_dataframe.empty:
        embeddings_dataframe = new_rows
    else:
        embeddings_dataframe = pd.concat(
            [embeddings_dataframe, new_rows], ignore_index=True
        )


fill_dataframe(dataloader_train)
fill_dataframe(dataloader_test)
fill_dataframe(dataloader_val)

100%|██████████| 6473/6473 [01:42<00:00, 63.14it/s]
100%|██████████| 809/809 [00:12<00:00, 62.28it/s]
100%|██████████| 809/809 [00:12<00:00, 62.33it/s]


Finally, we store the dataframe in a pickle file. Unlike a text format such as CSV, pickling keeps the NumPy embedding arrays intact instead of converting them to strings.

In [None]:
# Persist the dataframe with pickle so the NumPy arrays in the "embedding"
# column are stored as arrays. Only unpickle this file from a trusted source.
embeddings_dataframe.to_pickle("/content/drive/MyDrive/Image captioning/saved_data/embeddings_dataframe.pkl")