In [None]:
%load_ext nb_black
%load_ext pycodestyle_magic 
%pycodestyle_on 

# Visual-WSD

In [None]:
# !gdown https://drive.google.com/u/0/uc?id=1byX4wpe1UjyCVyYrT04sW17NnycKAK7N&export=download
# !unzip ./semeval-2023-task-1-V-WSD-train-v1.zip
# !rm ./semeval-2023-task-1-V-WSD-train-v1.zip

In [None]:
# !mkdir ./dataset
# !mv ./semeval-2023-task-1-V-WSD-train-v1/train_v1 ./dataset/train
# !mv ./dataset/train/train_images_v1 ./dataset/train/images
# !mv ./dataset/train/train.data.v1.txt ./dataset/train/train_data.txt
# !mv ./dataset/train/train.gold.v1.txt ./dataset/train/gold_data.txt

In [None]:
# def parse_gold_to_csv(data_path, gold_path, result_path):
#     with open(data_path, 'r') as f:
#         data_lines = f.readlines()

#     with open(gold_path, 'r') as g:
#         gold_lines = g.readlines()

#     image_paths = []
#     targets = []

#     for i, line in enumerate(data_lines):
#         words = line.strip().split('\t')
#         target = words[1]
#         image_path = words[2:]
#         correct_image_path = gold_lines[i].strip()
#         image_path = [path for path in image_path if path == correct_image_path]
#         image_paths.extend(image_path)
#         targets.extend([target] * len(image_path))

#     df = pd.DataFrame({'image_path': image_paths, 'target': targets})

#     df.to_csv(result_path, index=False)

In [None]:
# parse_gold_to_csv(
#     data_path = './dataset/train/train_data.txt',
#     gold_path = './dataset/train/gold_data.txt',
#     result_path = './dataset/train/combined_gold.csv'
# )

## Models

Visual-semantic embedding models: These models learn joint embeddings for images and text, allowing for similarity comparisons. Some notable models include:
    VSE++: A model that learns a shared embedding space using a combination of CNNs and LSTMs.
    SCAN: A model that utilizes a similarity-based contrastive loss to align images and text in a shared embedding space.

Transformer-based models: Transformer architectures have shown excellent performance in various natural language processing and computer vision tasks. They can be adapted for image-text matching by employing cross-modal attention mechanisms. Examples include:
    UNITER: A model that utilizes cross-modal transformer layers for joint image-text understanding.
    LXMERT: A model that uses a combination of vision transformers and language transformers for cross-modal understanding.

Dual-Stream models: These models have separate streams for image and text processing and incorporate cross-modal interactions at different levels. Notable examples include:
    DAN: A model that employs separate deep networks for images and text, with cross-modal interactions at the final layer.
    MFA: A model that incorporates multi-level feature alignment between images and text using a multi-modal factorized bilinear pooling approach.

Cross-modal Retrieval models: These models focus specifically on retrieval tasks, aiming to find the most relevant images or texts given a query. Some popular models in this category include:
    COSMOS: A model that utilizes deep bilinear modeling and triplet-based ranking losses for cross-modal retrieval.
    M3ER: A model that combines visual attention and language attention mechanisms for cross-modal retrieval tasks.

In [None]:
!wget https://t4.ftcdn.net/jpg/01/77/24/33/240_F_177243386_Luk2Jzj6QO7uWxOJuNTrX8TfPZFVqMj1.jpg
!mv 240_F_177243386_Luk2Jzj6QO7uWxOJuNTrX8TfPZFVqMj1.jpg image.jpg

## CLIP

In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)


def encode_image(image_path):
    """Embed the image stored at ``image_path`` with the loaded CLIP model.

    The file is opened with PIL, passed through ``preprocess``, given a leading
    batch axis with ``unsqueeze(0)`` and moved to ``device``. The forward pass
    runs with gradient tracking disabled.

    Returns whatever ``model.encode_image`` produces for that one-image batch.
    """
    pil_image = Image.open(image_path)
    batch = preprocess(pil_image)
    batch = batch.unsqueeze(0)
    batch = batch.to(device)
    with torch.no_grad():
        return model.encode_image(batch)


def encode_text(text):
    """Embed a single string with the loaded CLIP model.

    ``text`` is wrapped in a one-element list, tokenized with ``clip.tokenize``
    and moved to ``device``; the encoder then runs without gradient tracking.

    Returns whatever ``model.encode_text`` produces for that one-text batch.
    """
    tokens = clip.tokenize([text])
    tokens = tokens.to(device)
    with torch.no_grad():
        return model.encode_text(tokens)


def calculate_similarity(image_features, text_features):
    """Return 100 x the cosine similarity of one image and one text embedding.

    Both inputs are L2-normalised along their last axis before the dot product,
    so the score depends only on the angle between the embeddings and not on
    their magnitudes. Without this step two captions cannot be compared fairly,
    because a caption with a larger embedding norm would score higher.

    Args:
        image_features: tensor holding a single image embedding, e.g. (1, D).
        text_features: tensor holding a single text embedding, e.g. (1, D).

    Returns:
        The scaled similarity as a Python float (range roughly [-100, 100]).

    Raises:
        Whatever ``Tensor.item`` raises if the product has more than one element.
    """
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).squeeze()
    return similarity.item()


# Demo: score one local image against two candidate captions.
image_path = "./image.jpg"
text1 = "dog with ball"
text2 = "children playing"  # was misspelled "plating", which skewed the comparison

# Encode the image once and each caption separately.
image_features = encode_image(image_path)
text_features1 = encode_text(text1)
text_features2 = encode_text(text2)

# A higher value means the caption matches the image better.
print("Similarity1:", calculate_similarity(image_features, text_features1))
print("Similarity2:", calculate_similarity(image_features, text_features2))

## DAN

In [None]:
import torch
from torchvision.transforms import transforms
from torchvision.models import resnet50
from transformers import BertTokenizer, BertModel
from torch import nn
from PIL import Image


class DAN(nn.Module):
    """Two-branch image/text encoder that projects both inputs to one space.

    Each branch runs its own backbone and then a linear projection followed by
    ReLU, so both outputs have ``embedding_size`` features and can be compared
    directly (e.g. with cosine similarity).

    Args:
        vision_model: module mapping an image batch to features that flatten to
            2048 values per sample (the in-features of ``fc_vision``).
        language_model: module called as ``language_model(**text)`` whose result
            exposes ``pooler_output`` with 768 features per sample (the
            in-features of ``fc_language``).
        embedding_size: size of the shared embedding space.

    Note:
        ``fc_vision`` and ``fc_language`` are freshly initialised here; they
        must be trained before the similarities they produce are meaningful.
    """

    def __init__(self, vision_model, language_model, embedding_size):
        super().__init__()
        self.vision_model = vision_model
        self.language_model = language_model
        self.fc_vision = nn.Linear(2048, embedding_size)
        self.fc_language = nn.Linear(768, embedding_size)
        self.relu = nn.ReLU()

    def forward(self, image, text):
        """Return ``(image_embeddings, text_embeddings)``, each (batch, embedding_size).

        Args:
            image: batch passed unchanged to ``vision_model``.
            text: mapping of keyword arguments unpacked into ``language_model``.
        """
        image_features = self.vision_model(image)
        # Collapse any trailing spatial axes so the linear layer sees (batch, 2048).
        image_features = image_features.view(image_features.size(0), -1)
        image_embeddings = self.relu(self.fc_vision(image_features))

        text_features = self.language_model(**text).pooler_output
        # Project the text too; previously fc_language was defined but unused,
        # so the two outputs only lined up when embedding_size happened to be 768.
        text_embeddings = self.relu(self.fc_language(text_features))

        return image_embeddings, text_embeddings


# Image branch: pretrained ResNet-50 with its last child module (the classifier)
# dropped, so the network ends at the pooled feature map.
# NOTE(review): newer torchvision releases prefer `weights=` over `pretrained=`.
vision_model = resnet50(pretrained=True)
vision_model = nn.Sequential(*list(vision_model.children())[:-1])
vision_model = vision_model.eval()

# Text branch: pretrained BERT encoder and its tokenizer.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
language_model = BertModel.from_pretrained("bert-base-uncased")
language_model = language_model.eval()

embedding_size = 768

model = DAN(vision_model, language_model, embedding_size)
model = model.eval()

image_path = "./image.jpg"
text1 = "dog with ball"
text2 = "children playing"

image_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Force three channels: the Normalize step above is defined for exactly three,
# so grayscale or RGBA files would otherwise fail.
image = Image.open(image_path).convert("RGB")
image = image_transform(image).unsqueeze(0)

text_tokens1 = tokenizer.encode_plus(
    text1, add_special_tokens=True, padding="longest", return_tensors="pt"
)
text_tokens2 = tokenizer.encode_plus(
    text2, add_special_tokens=True, padding="longest", return_tensors="pt"
)

# Pure inference: skip building the autograd graph.
with torch.no_grad():
    image_embeddings, text_embeddings1 = model(image, text_tokens1)
    image_embeddings, text_embeddings2 = model(image, text_tokens2)

similarity1 = torch.cosine_similarity(image_embeddings, text_embeddings1, dim=1).item()
similarity2 = torch.cosine_similarity(image_embeddings, text_embeddings2, dim=1).item()

print("Similarity1:", similarity1)
print("Similarity2:", similarity2)

## SCAN

In [None]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("RN50x4", device=device)


def encode_image(image_path):
    """Return the CLIP image embedding for the file at ``image_path``.

    Steps: open with PIL, apply ``preprocess``, add a batch axis, move to
    ``device``, then call ``model.encode_image`` with gradients disabled.
    """
    pixels = preprocess(Image.open(image_path))
    batch = pixels.unsqueeze(0).to(device)
    with torch.no_grad():
        features = model.encode_image(batch)
        return features


def encode_text(text):
    """Return the CLIP text embedding for ``text``.

    ``text`` is handed to ``clip.tokenize`` as given, the token tensor is moved
    to ``device`` and ``model.encode_text`` runs with gradients disabled.
    """
    token_ids = clip.tokenize(text)
    token_ids = token_ids.to(device)
    with torch.no_grad():
        features = model.encode_text(token_ids)
        return features


def calculate_similarity(image_features, text_features):
    """Return 100 x the cosine similarity of one image and one text embedding.

    The embeddings are L2-normalised along their last axis first; a plain dot
    product would mix in the vector norms and make scores for different
    captions incomparable.

    Args:
        image_features: tensor with a single image embedding, e.g. (1, D).
        text_features: tensor with a single text embedding, e.g. (1, D).

    Returns:
        The scaled similarity as a Python float.
    """
    image_unit = image_features / image_features.norm(dim=-1, keepdim=True)
    text_unit = text_features / text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_unit @ text_unit.T).squeeze()
    return similarity.item()


# Demo inputs: one local image and two candidate captions.
image_path = "./image.jpg"
text1 = "dog with ball"
text2 = "children playing"

# Encode with the model loaded in this cell (clip.load("RN50x4")).
image_features = encode_image(image_path)
text_features1 = encode_text(text1)
text_features2 = encode_text(text2)

# Print the image/caption score for each caption.
print("Similarity1:", calculate_similarity(image_features, text_features1))
print("Similarity2:", calculate_similarity(image_features, text_features2))

## VSE++

In [None]:
import torch
from PIL import Image
from torchvision.transforms import functional as F
from transformers import BertModel, BertTokenizer
from sentence_transformers import SentenceTransformer, models

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Text branch: pretrained BERT encoder (moved to `device`) and its tokenizer.
text_model_name = "bert-base-uncased"
text_model = BertModel.from_pretrained(text_model_name).to(device)
tokenizer = BertTokenizer.from_pretrained(text_model_name)

# NOTE(review): "visual_model_name" is a literal placeholder string, not a
# variable holding a checkpoint name. TODO confirm that `models.VisionEncoder`
# exists in the installed sentence-transformers release and pass a real model
# identifier; as written this cell is unlikely to run.
visual_model = models.VisionEncoder("visual_model_name")
visual_model.to(device)
visual_model.eval()


def encode_image(image_path):
    """Return ``visual_model``'s embedding for the image at ``image_path``.

    The image is loaded as RGB, resized to 224x224, converted to a tensor,
    given a batch axis and moved to ``device``. No normalisation is applied.
    The encoder runs with gradients disabled.
    """
    rgb = Image.open(image_path).convert("RGB")
    resized = F.resize(rgb, (224, 224))
    batch = F.to_tensor(resized)
    batch = batch.unsqueeze(0).to(device)
    with torch.no_grad():
        return visual_model.encode_image(batch)


def encode_text(text):
    """Return element ``[1]`` of ``text_model``'s output for ``text``.

    ``text`` is tokenized with special tokens added, returned as PyTorch
    tensors and moved to ``device``; the model runs with gradients disabled.
    """
    encoded = tokenizer.encode_plus(text, add_special_tokens=True, return_tensors="pt")
    encoded = encoded.to(device)
    with torch.no_grad():
        outputs = text_model(**encoded)
    return outputs[1]


def calculate_similarity(image_features, text_features):
    """Return the cosine similarity of two single-row embeddings as a float.

    Uses ``torch.nn.functional.cosine_similarity`` with its default axis and
    converts the one-element result with ``.item()``.
    """
    cosine = torch.nn.functional.cosine_similarity(image_features, text_features)
    return cosine.item()


# Demo inputs: one local image and two candidate captions.
image_path = "./image.jpg"
text1 = "dog with ball"
text2 = "children playing"

# Image goes through `visual_model`, captions through the BERT `text_model`.
image_features = encode_image(image_path)
text_features1 = encode_text(text1)
text_features2 = encode_text(text2)

# Print the cosine similarity between the image and each caption.
print("Similarity1:", calculate_similarity(image_features, text_features1))
print("Similarity2:", calculate_similarity(image_features, text_features2))

1) Training:
    a. Collect pairs of images and their corresponding text descriptions.
    b. Convert the text descriptions to embeddings.
    c. Train the model to generate similar embeddings for similar images and text descriptions.
    d. During training, pass both the images and embeddings to the model.

2) Embedding Generation:
    a. After training, utilize the trained model to generate embeddings for all images in the gallery.
    b. Pass each image through the model and save the resulting embedding.
    c. Exclude the text descriptions since only image embeddings are required in this step.
    d. Images from outside the training dataset can also be added to the gallery.

3) Query Processing:
    a. Convert a text query into an embedding using the same method used during training.
    b. Compare the query embedding to all the image embeddings in the gallery.
    c. Utilize a technique like k-nearest neighbors (k-NN) to find the most similar images.

4) Image Retrieval:
    a. Retrieve the images that have the most similar embeddings to the query embedding.
    b. Return these images as the ones deemed most relevant to the query.

In [None]:
!pip install -q kaggle
from google.colab import files

files.upload()  #  kaggle api token

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d adityajn105/flickr8k

In [None]:
%%capture
!unzip ./flickr8k.zip
!rm flickr8k.zip
!mkdir ./data/
!mv ./Images ./data/images
!mv ./captions.txt ./data/captions.txt
!mkdir ./data/embeddings
!mkdir ./data/checkpoints/

In [None]:
%%capture
!pip install transformers
!pip install rouge
!pip install pycocoevalcap
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import pandas as pd
import numpy as np
import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True
import os
from tqdm import tqdm
import random
import clip

import warnings

warnings.filterwarnings("ignore")

import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.tensorboard import SummaryWriter
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.spice.spice import Spice
from transformers import GPT2Model, GPT2Tokenizer
from transformers import CLIPProcessor, CLIPModel
from transformers import AdamW

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def txt_to_csv(txt_file, csv_file):
    """Convert an ``image,caption`` text file into a properly quoted CSV.

    Each non-blank line is split at its FIRST comma only: everything before it
    is the image name and everything after it is the caption, kept verbatim.
    Splitting on every comma and re-joining (the previous approach) altered
    captions that contain commas. Blank lines are skipped so they do not turn
    into empty rows.

    The output has no extra header row (``header=False``); a header line that is
    present in the input passes through as the first row.

    Args:
        txt_file: path of the input text file.
        csv_file: path of the CSV file to write.
    """
    records = []
    with open(txt_file, "r", encoding="utf-8") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            # partition() leaves caption == "" when the line has no comma.
            image, _, caption = line.partition(",")
            records.append({"Image": image, "Caption": caption})

    # Explicit columns keep the frame well-formed even when there are no rows.
    df = pd.DataFrame(records, columns=["Image", "Caption"])
    df.to_csv(csv_file, index=False, header=False)


txt_to_csv(txt_file="./data/captions.txt", csv_file="./data/captions.csv")

Precalculate GPT-2 embeddings

In [None]:
# def generate_and_save_embeddings(csv_file_path):
#     dataframe = pd.read_csv(csv_file_path)

#     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#     tokenizer.pad_token = tokenizer.eos_token
#     model = GPT2Model.from_pretrained('gpt2').to(DEVICE)

#     for index, row in tqdm(dataframe.iterrows()):
#         target = row['caption']
#         image_name = row['image']

#         inputs = tokenizer(target, return_tensors='pt', padding='max_length', truncation=True, max_length=512).to(DEVICE)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         embeddings = outputs.last_hidden_state
#         embeddings = torch.nn.functional.pad(embeddings, pad=(0, 0, 0, 512 - embeddings.size(1)))
#         embeddings = embeddings.to('cpu')

#         torch.save(embeddings, f'./data/embeddings/{image_name}_embedding.pt')


# generate_and_save_embeddings('./data/captions.csv')

Precalculate CLIP embeddings  
Note: when training the CLIP model itself, precomputed embeddings are not needed — the tokenized `input_ids` are fed to the model directly.

In [None]:
# def generate_and_save_clip_embeddings(csv_file, embedding_dir):

#     model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
#     processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model = model.to(DEVICE)

#     dataframe = pd.read_csv(csv_file)

#     for idx, row in tqdm(dataframe.iterrows()):
#         target = row['caption']

#         inputs = processor(text=target, return_tensors="pt", padding=True, truncation=True)
#         inputs = {name: tensor.to(DEVICE) for name, tensor in inputs.items()}
#         with torch.no_grad():
#             embeddings = model.get_text_features(**inputs)

#         image_name = row['image'].split('.')[0]
#         torch.save(embeddings.cpu(), os.path.join(embedding_dir, f"{image_name}_embedding.pt"))


# generate_and_save_clip_embeddings('./data/captions.csv', './data/embeddings')

In [None]:
class MyDataset(Dataset):
    """Image/caption pairs prepared for CLIP fine-tuning.

    The CSV is read with ``pd.read_csv``; column 0 is the image file name
    (relative to ``image_dir``) and column 1 is the caption.

    Args:
        csv_file: path of the captions CSV.
        image_dir: directory containing the image files.
        transform: optional albumentations-style callable invoked as
            ``transform(image=ndarray)["image"]``. It must return pixel values
            in the 0-255 range, either as an HWC numpy array or as a CHW
            tensor. Do not normalise here: the result is cast to uint8 and
            CLIP's own ``preprocess`` performs the normalisation.
        preprocess: optional CLIP preprocessing callable. When omitted it is
            obtained from ``clip.load("ViT-B/32")``, which also loads model
            weights; pass the callable you already have to avoid that.
    """

    def __init__(self, csv_file, image_dir, transform=None, preprocess=None):
        self.dataframe = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform

        if preprocess is None:
            # Only the preprocessing pipeline is kept; the model is discarded.
            _, preprocess = clip.load("ViT-B/32", device=DEVICE, jit=False)
        self.preprocess = preprocess

    def __len__(self):
        """Number of rows (image/caption pairs) in the CSV."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, input_ids)`` for row ``idx``.

        ``image`` is the float tensor produced by ``preprocess``;
        ``input_ids`` is the ``clip.tokenize`` output for the caption.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx, 0])
        # The context manager closes the file handle once pixels are copied.
        with Image.open(img_name) as img_file:
            image = np.array(img_file.convert("RGB"))

        if self.transform:
            transformed = self.transform(image=image)
            image = transformed["image"]

        # ToTensorV2-style transforms return a CHW tensor; without a transform
        # (or without ToTensorV2) the image is still an HWC numpy array.
        if torch.is_tensor(image):
            image = image.permute(1, 2, 0).numpy()
        image = np.asarray(image).astype(np.uint8)
        image = Image.fromarray(image)
        image = self.preprocess(image).float()

        target = self.dataframe.iloc[idx, 1]
        # truncate=True keeps over-long captions from raising in clip.tokenize.
        input_ids = clip.tokenize(target, truncate=True)

        return image, input_ids

    # Currently unused: the data loaders rely on default collation.
    @staticmethod
    def collate_fn(batch):
        """Stack images and zero-pad token tensors into batch tensors."""
        images, input_ids = zip(*batch)
        images = torch.stack(images)
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
        return images, input_ids

In [None]:
def get_mean_std(loader):
    """Compute per-channel mean and standard deviation over a data loader.

    The loader must yield ``(data, _)`` pairs where ``data`` is a 4-D tensor
    reduced over axes 0, 2 and 3 (assumes batch, channel, height, width layout —
    TODO confirm against the dataset). Batches whose ``data`` is ``None`` or
    not a tensor are skipped.

    Every image contributes equally: batch statistics are weighted by batch
    size, so a smaller final batch does not skew the result.

    Args:
        loader: iterable of ``(data, target)`` batches.

    Returns:
        ``(mean, std)`` tensors with one value per channel.

    Raises:
        ValueError: if no usable batch was found.
    """
    channels_sum, channels_sqrd_sum, num_samples = 0, 0, 0

    for data, _ in tqdm(loader):
        if data is None or not torch.is_tensor(data):
            continue
        # torch.mean rejects integer tensors such as raw uint8 images.
        data = data.float()
        batch_size = data.size(0)
        channels_sum += torch.mean(data, dim=[0, 2, 3]) * batch_size
        channels_sqrd_sum += torch.mean(data**2, dim=[0, 2, 3]) * batch_size
        num_samples += batch_size

    if num_samples == 0:
        raise ValueError("get_mean_std: loader produced no usable batches")

    mean = channels_sum / num_samples
    # Var[x] = E[x^2] - E[x]^2; clamp so rounding error cannot make it negative.
    variance = (channels_sqrd_sum / num_samples - mean**2).clamp(min=0)
    std = variance**0.5

    return mean, std


# tmp_transform = A.Compose([
#     A.Resize(224, 224),
#     ToTensorV2()
# ])

# dataset = MyDataset(csv_file='./data/captions.csv', image_dir='./data/images/', embeddings_dir='./data/embeddings/', transform=tmp_transform)
# dataloader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=0)

# mean, std = get_mean_std(dataloader)
# print(mean)
# print(std)

# [116.6669, 113.6163, 102.8635]   for flickr8k
# [70.0214, 68.1754, 71.8770]

In [None]:
# Per-channel statistics measured on Flickr8k in the 0-255 pixel range.
# They are kept for reference and are NOT applied in the pipeline below.
MEAN = [116.6669, 113.6163, 102.8635]
STD = [70.0214, 68.1754, 71.8770]


# Augmentation pipeline applied to uint8 images before CLIP preprocessing.
#
# A.Normalize is deliberately left out: MyDataset casts the transformed image
# back to uint8 and passes it to CLIP's `preprocess`, which normalises it.
# Normalised floats would be destroyed by that cast. A.Normalize also multiplies
# mean/std by max_pixel_value (255 by default), so 0-255 statistics would be
# scaled twice.
#
# The imgaug-backed IAA* transforms are replaced by their native equivalents
# (GaussNoise, Sharpen, Emboss); the IAA* classes are gone from current
# albumentations releases.
transform = A.Compose(
    [
        A.Resize(224, 224),
        A.HorizontalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.Transpose(p=0.5),
        A.ShiftScaleRotate(
            shift_limit=0.0625,
            scale_limit=0.2,
            rotate_limit=15,
            p=0.9,
            border_mode=cv2.BORDER_REFLECT,
        ),
        A.GaussNoise(p=0.2),
        A.OneOf(
            [
                A.MotionBlur(p=0.2),
                A.MedianBlur(blur_limit=3, p=0.1),
                A.Blur(blur_limit=3, p=0.1),
            ],
            p=0.2,
        ),
        A.OneOf(
            [
                A.CLAHE(clip_limit=2),
                A.Sharpen(),
                A.Emboss(),
                A.RandomBrightnessContrast(),
            ],
            p=0.3,
        ),
        A.HueSaturationValue(p=0.3),
        # Outputs a uint8 CHW tensor, the layout MyDataset converts back to PIL.
        ToTensorV2(),
    ]
)

In [None]:
def create_data_loaders(dataset, batch_size=64, val_split=0.2):
    """Randomly split ``dataset`` and wrap both parts in data loaders.

    The training part gets ``int((1 - val_split) * len(dataset))`` samples and
    the validation part gets the remainder. Only the training loader shuffles.
    Both loaders use ``num_workers=0`` and default collation.

    Returns:
        ``(train_loader, val_loader)``.
    """
    total = len(dataset)
    n_train = int((1 - val_split) * total)
    n_val = total - n_train

    train_subset, val_subset = random_split(dataset, [n_train, n_val])

    shared_kwargs = dict(batch_size=batch_size, num_workers=0)
    train_loader = DataLoader(train_subset, shuffle=True, **shared_kwargs)
    val_loader = DataLoader(val_subset, shuffle=False, **shared_kwargs)

    return train_loader, val_loader

In [None]:
def seed_everything(seed=1):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.

    Notes:
        * ``cudnn.benchmark`` is switched OFF: its autotuner may pick different
          kernels between runs, which defeats ``cudnn.deterministic = True``.
        * ``PYTHONHASHSEED`` is exported for subprocesses; it does not change
          hashing in the interpreter that is already running.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not just the current one; no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(seed=1)

In [None]:
def save_checkpoint(model, optimizer, epoch, loss, filepath):
    """Write model/optimizer state plus ``epoch`` and ``loss`` to ``filepath``.

    The file is a ``torch.save`` dictionary with the keys
    ``model_state_dict``, ``optimizer_state_dict``, ``epoch`` and ``loss``.
    """
    state = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        epoch=epoch,
        loss=loss,
    )
    torch.save(state, filepath)


def load_checkpoint(model, optimizer, filepath):
    """Restore ``model`` and ``optimizer`` in place from a saved checkpoint.

    The file must be a dictionary with the keys ``model_state_dict``,
    ``optimizer_state_dict``, ``epoch`` and ``loss``.

    Tensors are first mapped to the CPU so a checkpoint written on a GPU
    machine can be read on a CPU-only one. ``load_state_dict`` then copies the
    values into the existing parameters and optimizer state on whatever device
    they already live on.

    Returns:
        ``(model, optimizer, epoch, loss)`` — the same model and optimizer
        objects that were passed in, plus the stored epoch and loss.
    """
    checkpoint = torch.load(filepath, map_location="cpu")
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    return model, optimizer, epoch, loss

We cannot calculate captioning metrics during training, because the model produces embeddings in a shared space rather than final predictions, i.e. we have neither ground-truth captions nor predicted captions to compare.

In [None]:
# def calculate_metrics(predictions, ground_truth):
#     predictions = predictions.tolist()
#     ground_truth = ground_truth.tolist()

#     rouge = Rouge()
#     cider = Cider()
#     meteor = Meteor()
#     spice = Spice()

#     bleu_scores = [sentence_bleu([truth], pred) for pred, truth in zip(predictions, ground_truth)]
#     bleu = sum(bleu_scores) / len(bleu_scores)

#     rouge_scores = [rouge.get_scores(pred, truth)[0]['rouge-l']['f'] for pred, truth in zip(predictions, ground_truth)]

#     rouge = sum(rouge_scores) / len(rouge_scores)

#     cider_score, _ = cider.compute_score({i: [truth] for i, truth in enumerate(ground_truth)}, {i: [pred] for i, pred in enumerate(predictions)})

#     meteor_score, _ = meteor.compute_score({i: [truth] for i, truth in enumerate(ground_truth)}, {i: [pred] for i, pred in enumerate(predictions)})

#     spice_score, _ = spice.compute_score({i: [truth] for i, truth in enumerate(ground_truth)}, {i: [pred] for i, pred in enumerate(predictions)})

#     return torch.tensor([bleu, rouge, cider_score, meteor_score, spice_score])

In [None]:
def train_step(model, images, input_ids, optimizer):
    """Run one optimisation step of the symmetric contrastive loss.

    ``model(images, input_ids)`` must return ``(logits_per_image,
    logits_per_text)``, each of shape (batch, batch), which is what the OpenAI
    CLIP model's ``forward`` returns. These are already similarity logits, so
    they go straight into the loss. Multiplying them together again, as if they
    were feature vectors, trains on the wrong quantity.

    Row ``i`` of each logit matrix should peak at column ``i`` (the matching
    pair), so the targets are ``0..batch-1``.

    Args:
        model: module returning the two logit matrices.
        images: image batch, already on the model's device.
        input_ids: tokenized captions, already on the model's device.
        optimizer: optimizer that owns ``model``'s parameters.

    Returns:
        The loss (image-to-text plus text-to-image cross entropy) as a float.
    """
    optimizer.zero_grad()
    logits_per_image, logits_per_text = model(images, input_ids)

    # Build the targets on the logits' device rather than a global constant.
    labels = torch.arange(len(images), device=logits_per_image.device)

    loss = F.cross_entropy(logits_per_image, labels) + F.cross_entropy(
        logits_per_text, labels
    )

    loss.backward()
    optimizer.step()

    return loss.item()


def val_step(model, images, input_ids):
    """Compute the symmetric contrastive loss on one batch without gradients.

    ``model(images, input_ids)`` must return ``(logits_per_image,
    logits_per_text)`` as the OpenAI CLIP model's ``forward`` does; the logits
    are used directly for the loss.

    The model is put in eval mode for the forward pass and afterwards restored
    to whichever mode it was in on entry.

    Args:
        model: module returning the two logit matrices.
        images: image batch, already on the model's device.
        input_ids: tokenized captions, already on the model's device.

    Returns:
        ``(loss, logits_per_image)`` — the loss as a float and the
        (batch, batch) image-to-text logit matrix.
    """
    was_training = model.training
    model.eval()

    with torch.no_grad():
        logits_per_image, logits_per_text = model(images, input_ids)

        labels = torch.arange(len(images), device=logits_per_image.device)
        loss = F.cross_entropy(logits_per_image, labels) + F.cross_entropy(
            logits_per_text, labels
        )

    model.train(was_training)
    return loss.item(), logits_per_image


def train(model, train_loader, val_loader, num_epochs, writer):
    """Fine-tune ``model`` and log the average train/val loss per epoch.

    Args:
        model: module accepted by ``train_step`` / ``val_step``.
        train_loader: loader yielding ``(images, input_ids)`` batches, where
            ``input_ids`` has a singleton axis 1 that is squeezed away here.
        val_loader: loader with the same batch structure for validation.
        num_epochs: number of passes over ``train_loader``.
        writer: object with ``add_scalar(tag, value, step)``, e.g. a
            TensorBoard ``SummaryWriter``.

    Side effects:
        Writes ``Loss/train`` and ``Loss/val`` scalars, prints a summary for
        every epoch, and saves a checkpoint under ``./data/checkpoints/``
        every fifth epoch and after the final epoch.
    """
    # Optimise in fp32: clip.load returns half-precision weights on GPU, and
    # Adam-style updates on fp16 parameters easily overflow to NaN.
    model = model.to(DEVICE).float()
    model.train()
    optimizer = AdamW(model.parameters())
    # T_max counts optimisation steps, so the scheduler is stepped per batch.
    total_steps = max(len(train_loader) * num_epochs, 1)
    scheduler = CosineAnnealingLR(optimizer, total_steps)

    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0

        for images, input_ids in tqdm(train_loader, leave=False):
            images = images.to(DEVICE)
            input_ids = input_ids.squeeze(1).to(DEVICE)
            train_loss += train_step(model, images, input_ids, optimizer)
            scheduler.step()

        for images, input_ids in tqdm(val_loader, leave=False):
            images = images.to(DEVICE)
            input_ids = input_ids.squeeze(1).to(DEVICE)
            loss, _ = val_step(model, images, input_ids)
            val_loss += loss

        # Per-batch averages, so the two curves are comparable.
        avg_train_loss = train_loss / max(len(train_loader), 1)
        avg_val_loss = val_loss / max(len(val_loader), 1)

        writer.add_scalar("Loss/train", avg_train_loss, epoch)
        writer.add_scalar("Loss/val", avg_val_loss, epoch)

        print(f"==> Epoch {epoch+1}/{num_epochs}")
        print(f"\tTrain Loss: {avg_train_loss}")
        print(f"\tVal Loss: {avg_val_loss}")

        # Also save after the last epoch so the final weights are never lost.
        if epoch % 5 == 0 or epoch == num_epochs - 1:
            save_checkpoint(
                model,
                optimizer,
                epoch,
                avg_val_loss,
                f"./data/checkpoints/model_epoch_{epoch}.pt",
            )

In [None]:
# Build the caption dataset; `transform` is the augmentation pipeline defined
# in an earlier cell.
dataset = MyDataset(
    csv_file="./data/captions.csv", image_dir="./data/images/", transform=transform
)

# Default arguments: batch_size=64, val_split=0.2.
train_loader, val_loader = create_data_loaders(dataset)

# Rebinds the global `model` to a CLIP ViT-B/32; the second return value
# (the preprocessing transform) is discarded.
model, _ = clip.load("ViT-B/32", device=DEVICE, jit=False)

# SummaryWriter() with no arguments uses its default log directory.
writer = SummaryWriter()

In [None]:
train(model, train_loader, val_loader, 10, writer)