## **View Data**

In [None]:
import json
import requests
from PIL import Image
from io import BytesIO
from collections import defaultdict

def load_coco_captions_with_urls(annotation_file):
    """Group the captions of a COCO captions file by image URL.

    Args:
        annotation_file: Path to a COCO-style captions JSON file. The file
            must contain an 'images' list (entries with 'id' and 'coco_url')
            and an 'annotations' list (entries with 'image_id' and 'caption').

    Returns:
        A defaultdict(list) mapping each image URL to the list of its
        captions, in the order the annotations appear in the file.
        Annotations whose image_id has no (non-empty) URL are skipped.
    """
    # JSON is UTF-8 by specification; being explicit avoids decode errors or
    # mojibake on platforms whose default locale encoding is not UTF-8.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    # Lookup table: image id -> image URL
    id_to_url = {img['id']: img['coco_url'] for img in coco['images']}

    # Accumulate captions under the URL of the image they describe
    captions_dict = defaultdict(list)
    for ann in coco['annotations']:
        image_url = id_to_url.get(ann['image_id'])
        if image_url:
            captions_dict[image_url].append(ann['caption'])

    return captions_dict

# Build the URL -> captions mapping from the train2017 captions file
annotation_file = 'captions_train2017.json'
captions_dict = load_coco_captions_with_urls(annotation_file)

# Preview the first 100 images: print each URL followed by all of its captions
preview_items = list(captions_dict.items())[:100]
for image_url, captions in preview_items:
    # Optional image preview, left disabled so the loop makes no network requests:
    # response = requests.get(image_url)
    # image = Image.open(BytesIO(response.content))
    # image.show()

    print(f"\nImage URL: {image_url}")
    for i, caption in enumerate(captions):
        print(f"Caption {i+1}: {caption}")



Image URL: http://images.cocodataset.org/train2017/000000203564.jpg
Caption 1: A bicycle replica with a clock as the front wheel.
Caption 2: The bike has a clock as a tire.
Caption 3: A black metal bicycle with a clock inside the front wheel.
Caption 4: A bicycle figurine in which the front wheel is replaced with a clock

Caption 5: A clock with the appearance of the wheel of a bicycle 

Image URL: http://images.cocodataset.org/train2017/000000322141.jpg
Caption 1: A room with blue walls and a white sink and door.
Caption 2: Blue and white color scheme in a small bathroom.
Caption 3: This is a blue and white bathroom with a wall sink and a lifesaver on the wall.
Caption 4: A blue boat themed bathroom with a life preserver on the wall
Caption 5: A bathroom with walls that are painted baby blue.

Image URL: http://images.cocodataset.org/train2017/000000016977.jpg
Caption 1: A car that seems to be parked illegally behind a legally parked car
Caption 2: two cars parked on the sidewalk on 

In [2]:
# captions_dict has one key per image URL, so its length is the number of distinct images.
print(f"\nTotal number of unique images: {len(captions_dict)}")



Total number of unique images: 118287


## **Build dataset with image_id, image_url, and 5 captions**

In [None]:
import json
import csv
from collections import defaultdict

def build_coco_csv(annotation_file, output_csv, num_captions=5):
    """Write one CSV row per image with its URL and first captions.

    Args:
        annotation_file: Path to a COCO-style captions JSON file with an
            'images' list (entries with 'id' and 'coco_url') and an
            'annotations' list (entries with 'image_id' and 'caption').
        output_csv: Path of the CSV file to create (overwritten if present).
        num_captions: Number of caption columns to write per image.
            Defaults to 5, producing columns caption_1 .. caption_5.

    The CSV has the columns image_id, image_url, caption_1 .. caption_N.
    Images with fewer than ``num_captions`` captions are left out; images
    with more keep only the first ``num_captions``. Annotations that point
    at an image id missing from 'images' are skipped instead of raising
    KeyError.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(annotation_file, 'r', encoding='utf-8') as f:
        coco = json.load(f)

    # Map image_id to coco_url
    id_to_url = {img['id']: img['coco_url'] for img in coco['images']}

    # Collect captions per image_id, preserving file order
    image_captions = defaultdict(list)
    for ann in coco['annotations']:
        image_captions[ann['image_id']].append(ann['caption'])

    caption_fields = [f'caption_{i}' for i in range(1, num_captions + 1)]
    fieldnames = ['image_id', 'image_url'] + caption_fields

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for image_id, captions in image_captions.items():
            image_url = id_to_url.get(image_id)
            # Skip orphan annotations and images without enough captions
            if image_url is None or len(captions) < num_captions:
                continue

            row = {'image_id': image_id, 'image_url': image_url}
            # zip stops at the last caption field, dropping any extra captions
            row.update(zip(caption_fields, captions))
            writer.writerow(row)

# Write the per-image caption table for the train2017 captions file
annotation_file = 'captions_train2017.json'
output_csv = 'coco_image_captions.csv'
build_coco_csv(annotation_file=annotation_file, output_csv=output_csv)


## **Build dataset with embeddings**

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import ast

# Load the dataset
csv_path = 'coco_image_captions.csv'
df = pd.read_csv(csv_path)

# Load the sentence transformer model (alternatives kept for reference)
# model = SentenceTransformer('all-MiniLM-L6-v2') # 384 dimensions
model = SentenceTransformer('all-mpnet-base-v2') # 768 dimensions
# model = SentenceTransformer('bert-large-nli-mean-tokens') # 1024 dimensions
# model = SentenceTransformer('hli/lstm-qqp-sentence-transformer') # 2048 dimensions

def get_avg_embedding(row):
    """Return the mean embedding of a row's five captions.

    Args:
        row: A DataFrame row providing the columns caption_1 .. caption_5.

    Returns:
        The element-wise mean of the five caption embeddings produced by the
        module-level ``model``, as a plain Python list.
    """
    captions = [row[f'caption_{i}'] for i in range(1, 6)]
    embeddings = model.encode(captions)
    avg_embedding = np.mean(embeddings, axis=0)
    return avg_embedding.tolist()  # Convert to list for CSV compatibility

# Apply to each row
# NOTE(review): this issues one model.encode() call per image; encoding the
# captions in larger batches would likely be much faster — confirm before changing.
df['avg_caption_embedding'] = df.apply(get_avg_embedding, axis=1)

# The file name carries the embedding size (768 for all-mpnet-base-v2) so it
# matches the 'coco_with_embeddings_768.csv' file this notebook reads back later.
output_path = 'coco_with_embeddings_768.csv'
df.to_csv(output_path, index=False)


In [None]:
# Sanity check: encode a single sentence and report the length of its vector
model = SentenceTransformer('all-mpnet-base-v2')
sentence = "This is a test sentence."
embeddings = model.encode(sentence)
embedding_size = len(embeddings)

print(f'The embedding size is: {embedding_size}')


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

The embedding size is: 768


In [14]:
# Reload the saved table of captions plus averaged embeddings.
# NOTE(review): the embeddings were written to CSV as Python lists, so this column
# presumably loads back as strings — confirm, and parse before any numeric use.
df = pd.read_csv('coco_with_embeddings_768.csv')


In [15]:
# Show the first rows of the reloaded table as a quick sanity check.
df.head()

Unnamed: 0,image_id,image_url,caption_1,caption_2,caption_3,caption_4,caption_5,avg_caption_embedding
0,203564,http://images.cocodataset.org/train2017/000000...,A bicycle replica with a clock as the front wh...,The bike has a clock as a tire.,A black metal bicycle with a clock inside the ...,A bicycle figurine in which the front wheel is...,A clock with the appearance of the wheel of a ...,"[-0.0007135845953598619, 0.002707877429202199,..."
1,322141,http://images.cocodataset.org/train2017/000000...,A room with blue walls and a white sink and door.,Blue and white color scheme in a small bathroom.,This is a blue and white bathroom with a wall ...,A blue boat themed bathroom with a life preser...,A bathroom with walls that are painted baby blue.,"[0.022635236382484436, -0.018114957958459854, ..."
2,16977,http://images.cocodataset.org/train2017/000000...,A car that seems to be parked illegally behind...,two cars parked on the sidewalk on the street,City street with parked cars and a bench.,Cars try to maneuver into parking spaces along...,A couple of cars parked in a busy street sidew...,"[0.004483063705265522, -0.004469511564821005, ..."
3,106140,http://images.cocodataset.org/train2017/000000...,A large passenger airplane flying through the ...,There is a GOL plane taking off in a partly cl...,"An airplane that is, either, landing or just t...",An red and white airplane is in the cloudy sky.,A passenger plane taking off into the sky.,"[0.0016963969683274627, 0.024761024862527847, ..."
4,571635,http://images.cocodataset.org/train2017/000000...,"A bathroom with a toilet, sink, and shower.",A full bathroom with a wicker laundry basket.,A little bathrood decorated with many colorful...,A small bathroom containing a toilet and sink.,"Bathroom containing a toilet, a sink and a wic...","[0.0051090167835354805, -0.014415529556572437,..."


In [None]:
# Keep only the identifier and the embedding: drop the raw captions and the URL
columns_to_drop = [*(f'caption_{i}' for i in range(1, 6)), 'image_url']
df_reduced = df.drop(columns_to_drop, axis=1)
df_reduced.head()

Unnamed: 0,image_id,avg_caption_embedding
0,203564,"[-0.0007135845953598619, 0.002707877429202199,..."
1,322141,"[0.022635236382484436, -0.018114957958459854, ..."
2,16977,"[0.004483063705265522, -0.004469511564821005, ..."
3,106140,"[0.0016963969683274627, 0.024761024862527847, ..."
4,571635,"[0.0051090167835354805, -0.014415529556572437,..."


In [None]:
import pandas as pd

# Split the reduced table into three parquet files: the first two hold
# total_rows // 3 rows each, and the last one also takes the remainder rows.
total_rows = len(df_reduced)
chunk_size = total_rows // 3

for i in (0, 1, 2):
    start = chunk_size * i
    if i == 2:
        end = total_rows
    else:
        end = start + chunk_size
    chunk = df_reduced.iloc[start:end]
    chunk.to_parquet(f'coco_embeddings_768_part_{i+1}.parquet', index=False)


TODO:
- Get RAM code
- Modify RAM code to extract tags + confidence score (threshold)
- Combine tags + confidence score with embeddings as the final dataset