## Filtering with food keywords

In [1]:
import json
from tqdm import tqdm

# Load COCO annotation file
with open("/kaggle/input/coco-2017-dataset/coco2017/annotations/captions_train2017.json", "r") as f:
    coco_data = json.load(f)

# Keywords indicative of food context
food_keywords = [
    "food", "pizza", "plate", "bowl", "table", "kitchen", "dining", "meal", "cake", "bread", "sandwich",
    "lunch", "dinner", "breakfast", "cupcake", "cheese", "pasta", "dish", "eating", "cook", "burger",
    "noodle", "chopstick", "restaurant", "cooking", "doughnut", "fork", "spoon", "knife"
]

# Convert to lowercase for easier match
food_keywords = set(k.lower() for k in food_keywords)

# Step 1: Index image info
imgid2info = {img['id']: img for img in coco_data['images']}

# Step 2: Group captions by image_id and check if caption contains food-related keyword
image_dict = {}

for ann in tqdm(coco_data['annotations'], desc="Filtering food captions"):
    img_id = ann['image_id']
    caption = ann['caption'].lower()

    if any(kw in caption for kw in food_keywords):
        if img_id not in image_dict:
            image_info = imgid2info[img_id]
            image_dict[img_id] = {
                "filename": f"{img_id:012d}.jpg",
                "url": f"http://images.cocodataset.org/train2017/{img_id:012d}.jpg",
                "sentences": []
            }
        image_dict[img_id]["sentences"].append({
            "raw": ann['caption'],
            "tokens": ann['caption'].lower().split()
        })

# Step 3: Drop images with no valid captions or no variety
filtered_images = [
    img for img in image_dict.values()
    if len(img['sentences']) >= 2
]

# Step 4: Assign random split (80% train, 10% val, 10% test)
from random import shuffle
shuffle(filtered_images)

N = len(filtered_images)
for i, img in enumerate(filtered_images):
    if i < int(0.8 * N):
        img['split'] = 'train'
    elif i < int(0.9 * N):
        img['split'] = 'val'
    else:
        img['split'] = 'test'

# Step 5: Save as Karpathy-style JSON
karpathy_food_path = "/kaggle/working/coco_food_karpathy.json"
with open(karpathy_food_path, "w") as f:
    json.dump({"images": filtered_images}, f, indent=2)

print(f"✅ Saved filtered food-related COCO captions to: {karpathy_food_path}")
print(f"📊 Total food images: {len(filtered_images)}")

Filtering food captions: 100%|██████████| 591753/591753 [00:03<00:00, 160677.83it/s]


✅ Saved filtered food-related COCO captions to: /kaggle/working/coco_food_karpathy.json
📊 Total food images: 23251


## Preprocess into hdf5
for pretrain

In [2]:
import os
import json
import h5py
import numpy as np
from collections import Counter
from random import seed, choice, sample
from PIL import Image
from tqdm import tqdm
import torchvision.transforms as transforms

# Konfigurasi
json_path = "/kaggle/working/coco_food_karpathy.json"
image_folder = "/kaggle/input/coco-2017-dataset/coco2017/train2017"
output_folder = "/kaggle/working/pretrain_coco_food"
os.makedirs(output_folder, exist_ok=True)

captions_per_image = 5
min_word_freq = 5
max_len = 50
resized_size = 256

# Tokenizer
def tokenize(sentence):
    return sentence.lower().strip().split()

# Load JSON
with open(json_path, 'r') as f:
    data = json.load(f)

# Word frequency
word_freq = Counter()
for img in data['images']:
    for cap in img['sentences']:
        word_freq.update(cap['tokens'])

# Word map
words = [w for w in word_freq.keys() if word_freq[w] >= min_word_freq]
word_map = {k: v+1 for v, k in enumerate(words)}
word_map['<unk>'] = len(word_map) + 1
word_map['<start>'] = len(word_map) + 1
word_map['<end>'] = len(word_map) + 1
word_map['<pad>'] = 0

# Save word map
with open(os.path.join(output_folder, "wordmap_coco_food.json"), "w") as j:
    json.dump(word_map, j)

print("✅ Wordmap saved:", len(word_map), "words")

# Preprocess image
def load_image(img_path):
    img = Image.open(img_path).convert("RGB")
    tf = transforms.Compose([
        transforms.Resize((resized_size, resized_size)),
        transforms.ToTensor()
    ])
    return tf(img)

# Proses per split
for split in ['train', 'val', 'test']:
    split_imgs = [img for img in data['images'] if img['split'] == split]

    with h5py.File(os.path.join(output_folder, f"{split}_images_coco_food.hdf5"), 'w') as h:
        h.attrs['captions_per_image'] = captions_per_image
        images = h.create_dataset('images', (len(split_imgs), 3, resized_size, resized_size), dtype='uint8')

        encoded_captions = []
        caption_lengths = []

        for i, img in enumerate(tqdm(split_imgs, desc=f"{split} images")):
            img_path = os.path.join(image_folder, img["filename"])
            try:
                img_tensor = load_image(img_path) * 255
                images[i] = img_tensor.byte()
            except:
                print("⚠️ Skip corrupt image:", img["filename"])
                continue

            caps = [c["tokens"] for c in img["sentences"] if len(c["tokens"]) <= max_len]
            if len(caps) == 0:
                continue

            if len(caps) < captions_per_image:
                caps = caps + [choice(caps) for _ in range(captions_per_image - len(caps))]
            else:
                caps = sample(caps, k=captions_per_image)

            for c in caps:
                enc = [word_map['<start>']] + [word_map.get(w, word_map['<unk>']) for w in c] + [word_map['<end>']]
                enc += [word_map['<pad>']] * (max_len + 2 - len(enc))
                encoded_captions.append(enc)
                caption_lengths.append(len(c) + 2)

    # Save
    with open(os.path.join(output_folder, f"{split}_captions_coco_food.json"), "w") as f:
        json.dump(encoded_captions, f)
    with open(os.path.join(output_folder, f"{split}_caplength_coco_food.json"), "w") as f:
        json.dump(caption_lengths, f)

print("✅ Preprocessing done and saved to:", output_folder)


✅ Wordmap saved: 4718 words


train images: 100%|██████████| 18600/18600 [06:03<00:00, 51.23it/s]
val images: 100%|██████████| 2325/2325 [00:45<00:00, 51.17it/s]
test images: 100%|██████████| 2326/2326 [00:44<00:00, 52.36it/s]


✅ Preprocessing done and saved to: /kaggle/working/pretrain_coco_food
