In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

# Read the caption/link table from disk and preview the first rows
csv_path = 'MS_COCO_2017_links_captions.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)




Unnamed: 0,image_id,image_url,caption_1,caption_2,caption_3,caption_4,caption_5
0,203564,http://images.cocodataset.org/train2017/000000...,A bicycle replica with a clock as the front wh...,The bike has a clock as a tire.,A black metal bicycle with a clock inside the ...,A bicycle figurine in which the front wheel is...,A clock with the appearance of the wheel of a ...
1,322141,http://images.cocodataset.org/train2017/000000...,A room with blue walls and a white sink and door.,Blue and white color scheme in a small bathroom.,This is a blue and white bathroom with a wall ...,A blue boat themed bathroom with a life preser...,A bathroom with walls that are painted baby blue.
2,16977,http://images.cocodataset.org/train2017/000000...,A car that seems to be parked illegally behind...,two cars parked on the sidewalk on the street,City street with parked cars and a bench.,Cars try to maneuver into parking spaces along...,A couple of cars parked in a busy street sidew...
3,106140,http://images.cocodataset.org/train2017/000000...,A large passenger airplane flying through the ...,There is a GOL plane taking off in a partly cl...,"An airplane that is, either, landing or just t...",An red and white airplane is in the cloudy sky.,A passenger plane taking off into the sky.
4,571635,http://images.cocodataset.org/train2017/000000...,"A bathroom with a toilet, sink, and shower.",A full bathroom with a wicker laundry basket.,A little bathrood decorated with many colorful...,A small bathroom containing a toilet and sink.,"Bathroom containing a toilet, a sink and a wic..."


In [2]:
# Sentence-embedding model used by the embedding helpers in this notebook
MODEL_NAME = 'all-mpnet-base-v2'  # 768 dimensions
model = SentenceTransformer(MODEL_NAME)

def cosine_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (array-like accepted by ``np.dot`` / ``norm``).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. When either vector has zero length the
        similarity is undefined; ``0.0`` is returned instead of NaN so that a
        degenerate vector can never win a later ``np.argmax`` over the scores.
    """
    denom = norm(a) * norm(b)
    if denom == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(a, b) / denom

def get_closest_caption(row):
    """Select the caption whose embedding is nearest the mean of all five.

    Reads ``caption_1`` .. ``caption_5`` from ``row``, encodes them with the
    module-level ``model``, averages the resulting embeddings, and scores each
    caption by its cosine similarity to that average.

    Args:
        row: A mapping/Series exposing the keys ``caption_1`` .. ``caption_5``.
            NOTE(review): assumes every caption is a string -- confirm the CSV
            has no missing captions.

    Returns:
        tuple: ``(caption, embedding)`` -- the highest-scoring caption and its
        embedding converted with ``.tolist()``.
    """
    caption_texts = [row[f'caption_{i}'] for i in range(1, 6)]
    vectors = model.encode(caption_texts)

    # Centroid of the caption embeddings (element-wise mean over captions).
    centroid = np.mean(vectors, axis=0)

    # Score each caption against the centroid and keep the best one.
    scores = [cosine_sim(vec, centroid) for vec in vectors]
    winner = int(np.argmax(scores))

    return caption_texts[winner], vectors[winner].tolist()

# Run the selection for every row, then split the (caption, embedding) pairs
# into two parallel sequences and store each as its own column.
selections = df.apply(get_closest_caption, axis=1)
best_captions, best_embeddings = zip(*selections)
df['closest_caption'] = best_captions
df['closest_caption_embedding'] = best_embeddings

# Optional CSV export (currently disabled):
# output_path = 'coco_with_embeddings2.csv'
# df.to_csv(output_path, index=False)


In [3]:
# Preview the frame with the two closest-caption columns added
df.head(n=5)

Unnamed: 0,image_id,image_url,caption_1,caption_2,caption_3,caption_4,caption_5,closest_caption,closest_caption_embedding
0,203564,http://images.cocodataset.org/train2017/000000...,A bicycle replica with a clock as the front wh...,The bike has a clock as a tire.,A black metal bicycle with a clock inside the ...,A bicycle figurine in which the front wheel is...,A clock with the appearance of the wheel of a ...,A clock with the appearance of the wheel of a ...,"[-0.010257664136588573, 0.020423714071512222, ..."
1,322141,http://images.cocodataset.org/train2017/000000...,A room with blue walls and a white sink and door.,Blue and white color scheme in a small bathroom.,This is a blue and white bathroom with a wall ...,A blue boat themed bathroom with a life preser...,A bathroom with walls that are painted baby blue.,A bathroom with walls that are painted baby blue.,"[0.010431958362460136, 0.010314032435417175, -..."
2,16977,http://images.cocodataset.org/train2017/000000...,A car that seems to be parked illegally behind...,two cars parked on the sidewalk on the street,City street with parked cars and a bench.,Cars try to maneuver into parking spaces along...,A couple of cars parked in a busy street sidew...,A couple of cars parked in a busy street sidew...,"[-0.005434120539575815, -0.035251740366220474,..."
3,106140,http://images.cocodataset.org/train2017/000000...,A large passenger airplane flying through the ...,There is a GOL plane taking off in a partly cl...,"An airplane that is, either, landing or just t...",An red and white airplane is in the cloudy sky.,A passenger plane taking off into the sky.,A passenger plane taking off into the sky.,"[-0.010122910141944885, 0.013357488438487053, ..."
4,571635,http://images.cocodataset.org/train2017/000000...,"A bathroom with a toilet, sink, and shower.",A full bathroom with a wicker laundry basket.,A little bathrood decorated with many colorful...,A small bathroom containing a toilet and sink.,"Bathroom containing a toilet, a sink and a wic...","Bathroom containing a toilet, a sink and a wic...","[0.007634487468749285, 0.011867981404066086, -..."


In [4]:
# Drop the raw captions, the image URL, and the selected caption text;
# the remaining columns are what gets exported.
columns_to_drop = [f'caption_{i}' for i in range(1, 6)]
columns_to_drop.extend(['image_url', 'closest_caption'])

df_reduced = df.drop(labels=columns_to_drop, axis='columns')
df_reduced.head(n=5)


Unnamed: 0,image_id,closest_caption_embedding
0,203564,"[-0.010257664136588573, 0.020423714071512222, ..."
1,322141,"[0.010431958362460136, 0.010314032435417175, -..."
2,16977,"[-0.005434120539575815, -0.035251740366220474,..."
3,106140,"[-0.010122910141944885, 0.013357488438487053, ..."
4,571635,"[0.007634487468749285, 0.011867981404066086, -..."


In [5]:
import pandas as pd

# Write df_reduced to three parquet files of (nearly) equal size.
total_rows = len(df_reduced)
chunk_size = total_rows // 3

# Row offsets delimiting the three parts; the last part runs to the end of the
# frame so it also absorbs the remainder of the integer division.
boundaries = [0, chunk_size, 2 * chunk_size, total_rows]

for i, (start, end) in enumerate(zip(boundaries[:-1], boundaries[1:])):
    chunk = df_reduced.iloc[start:end]
    chunk.to_parquet(f'coco_embeddings2_768_part_{i+1}.parquet', index=False)


---

In [7]:

# Re-read the raw table so `df` starts again from the CSV contents
csv_path = 'MS_COCO_2017_links_captions.csv'
df = pd.read_csv(filepath_or_buffer=csv_path)

def get_concatenated_embedding(row):
    """Embed the five captions of ``row`` as one space-joined string.

    Args:
        row: A mapping/Series exposing the keys ``caption_1`` .. ``caption_5``.
            NOTE(review): assumes every caption is a string -- ``str.join``
            raises ``TypeError`` otherwise; confirm the CSV has no gaps.

    Returns:
        The embedding produced by the module-level ``model`` for the joined
        text, converted with ``.tolist()``.
    """
    # Captions are placed back to back, separated by a single space.
    long_text = " ".join(row[f'caption_{i}'] for i in range(1, 6))

    # One encode call -> one embedding for the whole concatenation.
    return model.encode(long_text).tolist()

# Compute one concatenated-caption embedding per row and store it as a column
concatenated_embeddings = df.apply(get_concatenated_embedding, axis=1)
df['concatenated_caption_embedding'] = concatenated_embeddings


In [8]:
# Preview the frame with the concatenated-caption embedding column added
df.head(n=5)

Unnamed: 0,image_id,image_url,caption_1,caption_2,caption_3,caption_4,caption_5,concatenated_caption_embedding
0,203564,http://images.cocodataset.org/train2017/000000...,A bicycle replica with a clock as the front wh...,The bike has a clock as a tire.,A black metal bicycle with a clock inside the ...,A bicycle figurine in which the front wheel is...,A clock with the appearance of the wheel of a ...,"[0.03783470392227173, 0.037530187517404556, 0...."
1,322141,http://images.cocodataset.org/train2017/000000...,A room with blue walls and a white sink and door.,Blue and white color scheme in a small bathroom.,This is a blue and white bathroom with a wall ...,A blue boat themed bathroom with a life preser...,A bathroom with walls that are painted baby blue.,"[0.008814840577542782, 0.011330989189445972, 0..."
2,16977,http://images.cocodataset.org/train2017/000000...,A car that seems to be parked illegally behind...,two cars parked on the sidewalk on the street,City street with parked cars and a bench.,Cars try to maneuver into parking spaces along...,A couple of cars parked in a busy street sidew...,"[0.043987542390823364, -0.01240357756614685, 0..."
3,106140,http://images.cocodataset.org/train2017/000000...,A large passenger airplane flying through the ...,There is a GOL plane taking off in a partly cl...,"An airplane that is, either, landing or just t...",An red and white airplane is in the cloudy sky.,A passenger plane taking off into the sky.,"[0.022646086290478706, 0.02198980003595352, 0...."
4,571635,http://images.cocodataset.org/train2017/000000...,"A bathroom with a toilet, sink, and shower.",A full bathroom with a wicker laundry basket.,A little bathrood decorated with many colorful...,A small bathroom containing a toilet and sink.,"Bathroom containing a toilet, a sink and a wic...","[0.032098181545734406, -0.02034934051334858, -..."


In [9]:
# Drop the raw captions and the image URL; the remaining columns are what
# gets exported.
columns_to_drop = [f'caption_{i}' for i in range(1, 6)]
columns_to_drop.append('image_url')

df_reduced = df.drop(labels=columns_to_drop, axis='columns')
df_reduced.head(n=5)


Unnamed: 0,image_id,concatenated_caption_embedding
0,203564,"[0.03783470392227173, 0.037530187517404556, 0...."
1,322141,"[0.008814840577542782, 0.011330989189445972, 0..."
2,16977,"[0.043987542390823364, -0.01240357756614685, 0..."
3,106140,"[0.022646086290478706, 0.02198980003595352, 0...."
4,571635,"[0.032098181545734406, -0.02034934051334858, -..."


In [10]:
import pandas as pd

# Write df_reduced to three parquet files of (nearly) equal size.
#
# NOTE(review): these file names are identical to the ones written by the
# earlier closest-caption export cell, so running both cells leaves only this
# cell's data on disk. Confirm the overwrite is intended, or give this export
# its own file-name prefix.
total_rows = len(df_reduced)
chunk_size = total_rows // 3

# Row offsets delimiting the three parts; the last part runs to the end of the
# frame so it also absorbs the remainder of the integer division.
boundaries = [0, chunk_size, 2 * chunk_size, total_rows]

for i, (start, end) in enumerate(zip(boundaries[:-1], boundaries[1:])):
    chunk = df_reduced.iloc[start:end]
    chunk.to_parquet(f'coco_embeddings2_768_part_{i+1}.parquet', index=False)
