In [5]:
import os
os.mkdir('/kaggle/tmp')
os.chdir('/kaggle/tmp')

FileExistsError: [Errno 17] File exists: '/kaggle/tmp'

In [6]:
"""
This module downloads Visual-WSD dataset and renames/restructures/simplifies it. 
All trial images are renamed (so they don't cross with train images) and moved to train ones.
All txt files are parsed and combined to single csv.

Final simplified structure of dataset is:
    -data
        -visual_wsd
            -images
                -image.number.jpg
                ...
            -dataset.csv
            
Columns of csv are: ['ambigues word', 'context (phrase)', 'target_image', {image_1 - image_9}(wrong images)]
"""

import asyncio
import os
import re
import shutil
from typing import Literal
from zipfile import ZipFile

import aiohttp
import pandas as pd
from aiohttp import ClientResponse, ClientTimeout

###### UNCOMMENT THIS IF YOU RUN IN JUPYTER ENVIROMENT
import nest_asyncio
nest_asyncio.apply()


class VisualWSDDownloader:
    """
    This class handles the downloading of the Visual-WSD dataset from Google Drive.
    It manages the virus scan page for large files, downloads the dataset in zip format, unzips it, and cleans up by removing the zip file.
    Additionally, it provides functionality to rename the dataset directories to a more manageable format.
    """

    def __init__(
        self, file_gdrive_id: str, zip_file_path: str, extract_to_path: str
    ) -> None:
        self.file_gdrive_id = file_gdrive_id
        self.zip_file_path = zip_file_path
        self.extract_to_path = extract_to_path

    async def download_file_from_google_drive(self) -> None:
        URL = "https://docs.google.com/uc?export=download"

        CUSTOM_TIMEOUT = 1000
        timeout = ClientTimeout(total=CUSTOM_TIMEOUT)

        async with aiohttp.ClientSession(timeout=timeout) as session:
            initial_response = await session.get(
                URL, params={"id": self.file_gdrive_id}
            )
            token = await self.get_confirm_token(initial_response)

            if token:
                params = {"id": self.file_gdrive_id, "confirm": token}
                response = await session.get(URL, params=params)
            else:
                response = initial_response

            await self.save_response_content(response)

    async def get_confirm_token(self, response: ClientResponse) -> str:
        if "text/html" in response.headers.get("Content-Type", ""):
            text = await response.text()
            match = re.search("confirm=([0-9A-Za-z_]+)&", text)
            return match.group(1) if match else None
        return None

    async def save_response_content(self, response: ClientResponse) -> None:
        CHUNK_SIZE = 32768
        with open(self.zip_file_path, "wb") as f:
            async for chunk in response.content.iter_chunked(CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

    def unzip_file(self) -> None:
        with ZipFile(self.zip_file_path, "r") as zip_ref:
            zip_ref.extractall(self.extract_to_path)

    def rename_directories(self) -> None:
        os.chdir("./data/")
        os.rename("./semeval-2023-task-1-V-WSD-train-v1", "./visual_wsd")
        os.rename("./visual_wsd/train_v1", "./visual_wsd/train")
        os.rename("./visual_wsd/trial_v1", "./visual_wsd/trial")
        os.rename("./visual_wsd/train/train_images_v1", "./visual_wsd/train/images")
        os.rename("./visual_wsd/trial/trial_images_v1", "./visual_wsd/trial/images")
        os.chdir("../")
        if os.path.exists(self.zip_file_path):
            print("visual_wsd zip file removed")
            os.remove(self.zip_file_path)

    async def run(self) -> None:
        print("===> Starting Visual_WSD Downloader")
        await self.download_file_from_google_drive()
        print("Visual_WSD dataset downloaded")
        self.unzip_file()
        print("Visual_WSD dataset unzipped")
        self.rename_directories()
        print("Visual_WSD dataset folders renamed\n")


class VisualWSDRestructurer:
    """
    This class is responsible for reorganizing the Visual-WSD dataset.
    It renames and moves trial images to avoid name conflicts with training images, parses text files related to the dataset,
    and combines this information into a single CSV file. The class also restructures the dataset into a simplified format
    with a specific folder structure and dataset CSV.
    """

    def __init__(self, data_path: str, dataset_name: str) -> None:
        self.data_path = data_path
        self.dataset_name = dataset_name
        self.path = os.path.join(data_path, dataset_name)

        self.max_num = self.find_max_image_number(
            os.path.join(self.path, "train", "images")
        )

    def find_max_image_number(self, images_path: str) -> int:
        max_num = 0
        for image_file in os.listdir(images_path):
            num = int(re.search(r"\d+", image_file).group())
            if num > max_num:
                max_num = num
        return max_num + 1

    def rename_move_trial_images(self) -> int:
        trial_images_path = os.path.join(self.path, "trial", "images")
        train_images_path = os.path.join(self.path, "train", "images")
        for filename in os.listdir(trial_images_path):
            match = re.search(r"\d+", filename)
            if match:
                number = int(match.group())
                new_number = self.max_num + number
                new_filename = filename.replace(str(number), str(new_number))
                shutil.move(
                    os.path.join(trial_images_path, filename),
                    os.path.join(train_images_path, new_filename),
                )

        if not os.listdir(trial_images_path):
            shutil.rmtree(trial_images_path)

    def txt2csv(
        self, path: str, datafile: str, goldfile: str, mode: Literal["train", "trial"]
    ) -> None:
        column_names = ["word", "context", "target"] + [
            f"image_{i}" for i in range(1, 11)
        ]

        data_file_path = os.path.join(path, datafile)
        df1 = pd.read_csv(data_file_path, sep="\t", header=None)

        gold_file_path = os.path.join(path, goldfile)
        df2 = pd.read_csv(gold_file_path, sep="\t", header=None)

        combined_df = pd.concat([df1.iloc[:, :2], df2, df1.iloc[:, 2:12]], axis=1)
        combined_df.columns = column_names

        def update_image_name(image_name: str) -> None:
            if mode == "trial":
                num = int(image_name.split(".")[1]) + self.max_num
                return f"image.{num}.jpg"
            return image_name

        combined_df["target"] = combined_df["target"].apply(update_image_name)
        for i in range(1, 11):
            combined_df[f"image_{i}"] = combined_df[f"image_{i}"].apply(
                update_image_name
            )

        combined_df["images"] = combined_df[
            [f"image_{i}" for i in range(1, 11)]
        ].values.tolist()
        combined_df["images"] = combined_df.apply(
            lambda row: [img for img in row["images"] if img != row["target"]], axis=1
        )

        for i in range(1, 10):
            combined_df[f"image_{i}"] = combined_df["images"].apply(
                lambda x: x[i - 1] if i <= len(x) else None
            )
        combined_df.drop(columns=["images", "image_10"], inplace=True)

        combined_df.to_csv(os.path.join(path, "dataset.csv"), index=False)

    def restructure(self) -> None:
        shutil.move(
            os.path.join(self.path, "train", "images"),
            os.path.join(self.path, "images"),
        )
        shutil.move(
            os.path.join(self.path, "train", "dataset.csv"),
            os.path.join(self.path, "dataset.csv"),
        )
        shutil.rmtree(os.path.join(self.path, "train"))
        shutil.rmtree(os.path.join(self.path, "trial"))

    def run(self) -> None:
        print("===> Starting Visual_WSD Restructurer")
        self.rename_move_trial_images()
        print("Visual_WSD dataset trial images reanamed")
        self.txt2csv(
            path=os.path.join(self.path, "train"),
            datafile="train.data.v1.txt",
            goldfile="train.gold.v1.txt",
            mode="train",
        )
        print("Visual_WSD dataset txt files parsed to csv")
        self.restructure()
        print("Visual_WSD dataset restructured\n")


async def main():
    visual_wsd_downloader = VisualWSDDownloader(
        "1byX4wpe1UjyCVyYrT04sW17NnycKAK7N", "./visual_wsd.zip", "./data/"
    )
    await visual_wsd_downloader.run()
    visual_wsd_restructurer = VisualWSDRestructurer("./data", "visual_wsd")
    visual_wsd_restructurer.run()


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())


===> Starting Visual_WSD Downloader
Visual_WSD dataset downloaded
Visual_WSD dataset unzipped
visual_wsd zip file removed
Visual_WSD dataset folders renamed

===> Starting Visual_WSD Restructurer
Visual_WSD dataset trial images reanamed
Visual_WSD dataset txt files parsed to csv
Visual_WSD dataset restructured



In [7]:
import os
from typing import Literal, Optional

import pandas as pd
import PIL
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torchvision.transforms import Compose

PIL.Image.MAX_IMAGE_PIXELS = 1000000000


class VisualWSDDataset(Dataset):
    """
    This class implements a torch dataset for the Visual-WSD dataset, inheriting from PyTorch's Dataset class.
    The class supports both training and evaluation modes and includes functionality for splitting the dataset
    into training and evaluation subsets. It also supports custom transformations on the images.
    """

    def __init__(
        self,
        path: str,
        csv_file: str,
        images_folder: str,
        transform: Optional[Compose] = None,
        mode: Literal["train", "eval"] = "eval",
        train_ratio: float = 0.8,
    ) -> None:
        self.path = path
        self.df = pd.read_csv(os.path.join(path, csv_file))
        self.images_folder = images_folder
        self.transform = transform
        self.mode = mode
        self.train_ratio = train_ratio

        if mode == "train":
            self.train_data, self.test_data = train_test_split(
                self.df, train_size=train_ratio
            )
        elif mode == "eval":
            self.data = self.df
        else:
            raise ValueError(
                f"Invalid mode. Choose 'train' or 'eval'. Provided mode: {mode}"
            )

    def __len__(self) -> int:
        if self.mode == "train":
            return len(self.train_data)
        else:
            return len(self.data)

    def __getitem__(self, idx: int) -> dict:
        if self.mode == "train":
            row = self.train_data.iloc[idx]
        else:
            row = self.data.iloc[idx]

        target_img_name = os.path.join(self.path, self.images_folder, row["target"])
        try:
            target_image = Image.open(target_img_name).convert("RGB")
        except OSError as e:
            print(f"\nCorrupted image, placeholder image used. Error message: {e}\n")
            return self.__getitem__(0)
        if self.transform:
            target_image = self.transform(target_image)

        candidate_images = []
        for i in range(1, 10):
            img_name = os.path.join(self.path, self.images_folder, row[f"image_{i}"])
            try:
                image = Image.open(img_name).convert("RGB")
            except OSError as e:
                print(
                    f"\nCorrupted image, placeholder image used. Error message: {e}\n"
                )
                return self.__getitem__(0)
            if self.transform:
                image = self.transform(image)
            candidate_images.append(image)
        candidate_images = torch.stack(candidate_images)

        sample = {
            "word": row["word"],
            "context": row["context"],
            "target": torch.Tensor(target_image),
            "candidate_images": candidate_images,
        }
        return sample




In [8]:
import os
import random
from typing import Literal

import numpy as np
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from torch.utils.data import DataLoader
from torchvision.transforms import (
    CenterCrop,
    Compose,
    InterpolationMode,
    Normalize,
    Resize,
    ToTensor,
)

# from utils import VisualWSDDataset

transform = Compose(
    [
        Resize(224, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(224),
        ToTensor(),
        # Normalize(
        #     (0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)
        # ),
    ]
)


def get_loaders(
    path: str,
    csv_file: str,
    images_folder: str,
    transform: Compose = transform,
    mode: Literal["train", "eval"] = "eval",
    batch_size: int = 1,
    num_workers: int = 0,
    shuffle: bool = True,
    split_ratio: float = 0.8,
) -> DataLoader | tuple[DataLoader, DataLoader]:
    if mode == "eval":
        eval_dataset = VisualWSDDataset(
            path=path,
            csv_file=csv_file,
            images_folder=images_folder,
            transform=transform,
            mode="eval",
        )
        eval_loader = DataLoader(
            eval_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
        )
        return eval_loader

    elif mode == "train":
        train_dataset = VisualWSDDataset(
            path=path,
            csv_file=csv_file,
            images_folder=images_folder,
            transform=transform,
            mode="train",
            split_ratio=split_ratio,
        )
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=num_workers,
        )

        test_dataset = VisualWSDDataset(
            path=path,
            csv_file=csv_file,
            images_folder=images_folder,
            transform=transform,
            mode="train",
            split_ratio=split_ratio,
            test_split=True,
        )
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
        )

        return train_loader, test_loader

    else:
        raise ValueError(
            f"Invalid mode. Choose 'train' or 'eval'. Provided mode: {mode}"
        )


def get_metrics(targets: list, ranks: list) -> tuple[float]:
    accuracy = sum(targets) / len(targets)

    true_targets = [1] * len(targets)

    f1 = f1_score(true_targets, targets)
    prec = precision_score(true_targets, targets)
    rec = recall_score(true_targets, targets)

    mrr = np.mean([1 / rank for rank in ranks])

    return accuracy, f1, prec, rec, mrr


def seed_everything(seed: int) -> None:
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True


In [9]:
from abc import ABC, abstractmethod

import torch
import torch.nn as nn


class BaseModel(ABC, nn.Module):
    """
    An abstract base class for models for Visual-WSD dataset.

    Attributes:
        model: pretrained model.
        processor: wrapped model's image processor and tokenizer into a single processor.
    """

    def __init__(self) -> None:
        super().__init__()
        self.model = None
        self.processor = None

    @abstractmethod
    def process_image(self, images: torch.Tensor) -> torch.Tensor:
        """
        Process the images.

        Args:
            images (torch.Tensor): A tensor containing the one image or stacked multiple images.

        Returns:
            torch.Tensor: The processed images.
        """
        pass

    @abstractmethod
    def process_text(self, texts: list[str]) -> torch.Tensor:
        """
        Process the textual input.

        Args:
            texts (list[str]): textual content (descriptions of images)

        Returns:
            torch.Tensor: The processed text.
        """
        pass

    @abstractmethod
    def forward(self, images: torch.Tensor, texts: list[str]) -> torch.Tensor:
        """
        The forward pass of the model. Should handle both text and image data, and return a tensor of logits,
        where on first place would be logit for target.

        Args:
            images (torch.Tensor): visual content
            texts (list[str]): textual content

        Returns:
            torch.Tensor: A tensor of logits of size [batch_size, 10].
        """
        pass


In [11]:
from io import BytesIO

import requests
import torch
from PIL import Image
from torchvision import transforms
from transformers import CLIPModel, CLIPProcessor

# from base_model import BaseModel


class CLIPMODEL(BaseModel):
    """
    https://huggingface.co/docs/transformers/model_doc/clip
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name, do_rescale=False)

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)

    def process_image(self, images: torch.Tensor) -> torch.Tensor:
        processed_images = self.processor(images=images, return_tensors="pt", dim=2).to(
            self.device
        )
        return processed_images

    def process_text(self, texts: list[str]) -> torch.Tensor:
        processed_texts = self.processor(
            text=texts, return_tensors="pt", padding=True
        ).to(self.device)
        return processed_texts

    def forward(self, images: torch.Tensor, texts: list[str]) -> torch.Tensor:
        images = images.to(self.device)
        logits = torch.zeros(images.shape[0], images.shape[1])

        for idx, sample_images in enumerate(images):
            processed_sample_images = self.process_image(sample_images)
            processed_phrase = self.process_text(texts[idx])

            output = self.model(
                input_ids=processed_phrase.input_ids,
                pixel_values=processed_sample_images.pixel_values,
                return_dict=True,
            )
            logits[idx] = output.logits_per_image.squeeze(1)

        return logits


## EVERYTHING BELOW IS CHECK
# image_urls = [
#     "http://images.cocodataset.org/val2017/000000039769.jpg",
# ]
# texts = ["a photo of a cat", "a photo of a dog"]


# def load_image(url):
#     response = requests.get(url)
#     img = Image.open(BytesIO(response.content))#.convert("RGB")
#     transform = transforms.ToTensor()
#     return transform(img)


# if __name__ == "__main__":
#     model_name = "openai/clip-vit-base-patch32"
#     model = CLIPMODEL(model_name=model_name)

#     images = torch.stack([load_image(url) for url in image_urls])
#     output = model(images, texts)
#     print(output)


In [18]:
import time

import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

# from utils import get_metrics


def evaluate_model(
    model: torch.nn.Module, data_loader: DataLoader
) -> dict[str, float | list]:
    start_time = time.time()
    model.eval()

    predicted_images = []  # store which image was predicted
    correct_preds = []  # store whether the target was correctly predicted (1) or (0)
    all_target_ranks = []  # store the rank of the target in each prediction
    phrases = []  # store input phrases for further analysis
    all_probs = []  # store the probabilities for further analysis

    loop = tqdm(enumerate(data_loader), total=len(data_loader))

    with torch.no_grad():
        for idx, batch in loop:
            phrases.extend(list(batch["context"]))
            texts = batch["context"]

            target, candidate_images = batch["target"], batch["candidate_images"]
            images = torch.cat([target.unsqueeze(1), candidate_images], dim=1)

            logits = model(images, texts)
            probs = F.softmax(logits, dim=1)

            top_prob, top_indices = torch.max(probs, dim=1)
            predicted_images.extend([pred.item() for pred in top_indices])

            for i in range(len(top_indices)):
                correct_target = 1 if top_indices[i] == 0 else 0
                correct_preds.append(correct_target)

                rank = (probs[i].sort(descending=True)[1] == 0).nonzero(as_tuple=True)[
                    0
                ].item() + 1
                all_target_ranks.append(rank)

                all_probs.append(probs[i].tolist())
            if idx == 2:
                break

    accuracy, f1, precision, recall, mrr = get_metrics(correct_preds, all_target_ranks)

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "mrr": mrr,
        "time": time.time() - start_time,
        "phrases": phrases,
        "predictions": predicted_images,
    }


In [23]:
model_name = "openai/clip-vit-base-patch32"
model = CLIPMODEL(model_name=model_name)

print(model.device)

cuda


In [19]:
loader = get_loaders(
    path = 'data/visual_wsd',
    csv_file = 'dataset.csv',
    images_folder = 'images',
    transform = transform,
    mode = "eval",
    batch_size = 128,
    num_workers = 2
)

In [20]:
res = evaluate_model(model, loader)

  2%|▏         | 2/101 [03:31<2:54:35, 105.82s/it]


In [21]:
print(res)

{'accuracy': 0.71875, 'f1': 0.8363636363636363, 'precision': 0.71875, 'recall': 1.0, 'mrr': 0.8200903191137566, 'time': 211.73718428611755, 'phrases': ['moorhen swamphen', 'serinus genus', 'pegmatite igneous', 'bangalores torpedo', 'bonxie skua', 'ixia genus', 'leucaena genus', 'mahonia genus', 'attalea genus', 'fagaceae family', 'gangster outlaw', 'upset success', 'brevicipitidae family', 'iridium metal', 'breakdown failure', 'catharanthus genus', 'leucanthemum genus', 'biro pen', 'maja genus', 'boletellus genus', 'beater implement', 'capparis genus', 'serenoa genus', 'sticherus genus', 'entoloma genus', 'foulard fabric', 'snert soup', 'biryani dish', 'sobriquet appellation', 'pigiron iron', 'menhaden clupeid', 'sprat sardine', 'groenendael laekenois', 'paddle beat', 'biodiversity diversity', 'protist microorganism', 'beech tree', 'ballroomdance ballroom', 'gourmand feeder', 'bitterroot wildflower', 'phanotron tube', 'ginglymostomatid shark', 'honeylocust locust', 'leucadendron genus'

In [22]:
res

{'accuracy': 0.71875,
 'f1': 0.8363636363636363,
 'precision': 0.71875,
 'recall': 1.0,
 'mrr': 0.8200903191137566,
 'time': 211.73718428611755,
 'phrases': ['moorhen swamphen',
  'serinus genus',
  'pegmatite igneous',
  'bangalores torpedo',
  'bonxie skua',
  'ixia genus',
  'leucaena genus',
  'mahonia genus',
  'attalea genus',
  'fagaceae family',
  'gangster outlaw',
  'upset success',
  'brevicipitidae family',
  'iridium metal',
  'breakdown failure',
  'catharanthus genus',
  'leucanthemum genus',
  'biro pen',
  'maja genus',
  'boletellus genus',
  'beater implement',
  'capparis genus',
  'serenoa genus',
  'sticherus genus',
  'entoloma genus',
  'foulard fabric',
  'snert soup',
  'biryani dish',
  'sobriquet appellation',
  'pigiron iron',
  'menhaden clupeid',
  'sprat sardine',
  'groenendael laekenois',
  'paddle beat',
  'biodiversity diversity',
  'protist microorganism',
  'beech tree',
  'ballroomdance ballroom',
  'gourmand feeder',
  'bitterroot wildflower',
  