# MSCOCO download preprocessing

Generates a metadata list of the images of the selected MSCOCO split, keeping only images with at least one caption that mentions a model class name.

In [16]:
# Root directory of the local MSCOCO copy and the split to preprocess.
data_dir = "/vol/ideadata/ed52egek/data/mscoco/"
split_name = "val2017"
#split_name = "train2017"
import os

# List the image files of the selected split. os.listdir returns entries in
# arbitrary order, so sort them: the row order feeds the seeded train/val
# assignment further down and must be stable across runs and machines.
# Non-jpg entries are skipped because later steps parse the file stem as an
# integer image id.
images = sorted(
    x for x in os.listdir(os.path.join(data_dir, split_name)) if x.endswith(".jpg")
)

# Image paths are stored relative to data_dir.
paths = [os.path.join(split_name, x) for x in images]

# Pixel map of each image: same file stem with a .png extension.
# splitext swaps only the extension, whereas str.replace("jpg", "png") would
# rewrite every "jpg" occurrence in the file name.
annot = [
    os.path.join(
        "annotations",
        f"stuff_{split_name}_pixelmaps",
        os.path.splitext(x)[0] + ".png",
    )
    for x in images
]

# Uncomment to work on a small subset while debugging.
#limit = 10
#paths = paths[:limit]
#annot = annot[:limit]

In [17]:
import torchvision.transforms as transforms
from PIL import Image
import pandas as pd

# Shared converter instances used by load_img (image -> tensor) and
# viz (tensor -> PIL image).
to_tensor, to_pil = transforms.ToTensor(), transforms.ToPILImage()


def center_crop(img):
    """Center-crop ``img`` to a square.

    The crop size is the smaller of the last two entries of ``img.size()``.

    Args:
        img: object exposing ``size()`` — presumably a tensor as returned by
            ``load_img``; TODO confirm against callers.

    Returns:
        The result of applying ``transforms.CenterCrop`` to ``img``.
    """
    shorter_side = min(img.size()[-2:])
    cropper = transforms.CenterCrop(shorter_side)
    return cropper(img)

def load_img(path):
    """Open the image file at ``path`` and convert it with ``to_tensor``.

    The image is opened in a ``with`` block so its file handle is released
    deterministically once the conversion has read the pixel data, instead of
    relying on garbage collection when many images are loaded in a loop.

    Args:
        path: location of the image file, passed to ``Image.open``.

    Returns:
        The value produced by ``to_tensor`` for the opened image.
    """
    with Image.open(path) as img:
        return to_tensor(img)

def viz(img):
    """Convert ``img`` with ``to_pil`` and display it through the result's ``show()``."""
    pil_image = to_pil(img)
    pil_image.show()


# Metadata table: one row per image, holding the image path and the path of
# its pixel map.
meta_columns = {"path": paths, "segmentation_path": annot}
data = pd.DataFrame(meta_columns)

In [18]:
import json

# Caption annotation file of the selected split.
path_json = os.path.join(data_dir, "annotations", f"captions_{split_name}.json")

# Parse the whole annotation file into memory.
with open(path_json, "r") as json_file:
    json_data = json.loads(json_file.read())

# Show the top-level sections of the annotation file.
json_data.keys()


dict_keys(['info', 'licenses', 'images', 'annotations'])

In [19]:
def _image_id_from_path(path):
    """Return the integer image id encoded in an image file name.

    The extension is removed with ``os.path.splitext``. ``str.rstrip(".jpg")``
    does not remove a suffix: it strips any trailing '.', 'j', 'p' or 'g'
    characters, which only happens to work for purely numeric file stems.
    """
    stem, _ = os.path.splitext(os.path.basename(path))
    return int(stem)


# Group all captions by the id of the image they describe.
imgs = {}
for annotation in json_data["annotations"]:
    imgs.setdefault(annotation["image_id"], []).append(annotation["caption"])

# Join the captions of every image into one "|"-separated string, in row
# order. An image without any caption annotation raises KeyError, as before.
captions_all = ["|".join(imgs[_image_id_from_path(path)]) for path in data["path"]]

data["captions"] = captions_all
# Intermediate snapshot with all (unfiltered) captions.
data.to_csv(os.path.join(data_dir, split_name + "_meta.csv"))


In [20]:
from src.datasets.mscoco import caption_contains_class_name

# For every image keep only the captions accepted by
# caption_contains_class_name, re-joined with the same "|" separator.
captions_all = [
    "|".join(
        caption
        for caption in joined.split("|")
        if caption_contains_class_name(caption)
    )
    for joined in data["captions"]
]

data["captions_filtered"] = captions_all

unfiltered_data = len(data)
# Drop images without any accepted caption. Take an explicit copy: columns are
# assigned to this frame later, and assigning to a boolean-mask selection
# triggers pandas' SettingWithCopyWarning.
data = data[data["captions_filtered"] != ""].copy()
print(f"Num samples: {unfiltered_data} - num samples after filtering captions: {len(data)}")


Num samples: 5000 - num samples after filtering captions: 4567


## Create our own train/validation split for LDM fine-tuning

In [21]:
from utils import DATASETSPLIT_TO_SPLIT, DatasetSplit

# Assign a split label to every remaining row.
if split_name == "train2017":
    import numpy as np

    # Fixed seed so the random train/val assignment is repeatable. One scalar
    # draw per row, in row order; rows whose draw exceeds 0.9 go to "val",
    # the rest to "train".
    np.random.seed(42)
    split = [
        DATASETSPLIT_TO_SPLIT["val"]
        if np.random.rand() > 0.9
        else DATASETSPLIT_TO_SPLIT["train"]
        for _ in range(len(data))
    ]
elif split_name == "val2017":
    # Every row of the val2017 images is labelled "test".
    split = [DATASETSPLIT_TO_SPLIT["test"]] * len(data)
else:
    # Fail loudly instead of leaving `split` undefined (or stale from an
    # earlier run) and writing a CSV without a valid split column.
    raise ValueError(
        f"Unsupported split_name {split_name!r}; expected 'train2017' or 'val2017'"
    )

data["split"] = split



In [22]:
# Write the final metadata table (filtered captions and split labels). This
# targets the same file as the earlier intermediate snapshot and replaces it.
meta_csv_path = os.path.join(data_dir, split_name + "_meta.csv")
data.to_csv(meta_csv_path)

In [23]:
# Quick check of the assigned labels: sums the per-row split values.
# The recorded run (split_name = "val2017") produced 0.
sum(split)

0