In [1]:
import numpy as np
import pandas as pd
import ast

Load data for the image tags + confidence scores

In [2]:
# The notebook keeps only these three columns from the tag CSVs, so ask the
# parser for just those instead of loading every column and discarding the rest.
TAG_COLUMNS = ["image_id", "tags", "confidence_scores"]

coco_df1 = pd.read_csv("MS_COCO_2017_links_captions_tags_0-75000.csv", usecols=TAG_COLUMNS)
coco_df2 = pd.read_csv("MS_COCO_2017_links_captions_tags_75000-118287.csv", usecols=TAG_COLUMNS)

# Stack both parts under a fresh 0..n-1 index. `usecols` preserves the file's
# column order, so re-select explicitly to pin the order used below.
coco_df = pd.concat([coco_df1, coco_df2], ignore_index=True)[TAG_COLUMNS]
print(coco_df.shape[0])
coco_df.head()

118287


Unnamed: 0,image_id,tags,confidence_scores
0,203564,bicycle | clock | white | black | wall | wall ...,"[0.9999895095825195, 0.999981164932251, 0.8835..."
1,322141,bathroom | sink | wall | faucet | bathroom acc...,"[0.999955415725708, 0.99992835521698, 0.968207..."
2,16977,car | bench | park | road | sedan | sit | park...,"[0.9999231100082397, 0.9998983144760132, 0.977..."
3,106140,plane | sky | fly | land | water | take | whit...,"[0.9999842643737793, 0.9638943672180176, 0.956..."
4,571635,bathroom | toilet bowl | sink | bathroom acces...,"[0.9999809265136719, 0.9998869895935059, 0.999..."


Load data for the caption embeddings

In [6]:
# Parquet shards that together hold the `avg_caption_embedding` column.
embeddings_parts = [
    "Embeddings/coco_embeddings_768_part_1.parquet",
    "Embeddings/coco_embeddings_768_part_2.parquet",
    "Embeddings/coco_embeddings_768_part_3.parquet",
]

# Read only the join key and the embedding column from each shard, then stack
# the shards under a fresh 0..n-1 index.
embeddings_df = pd.concat(
    map(
        lambda shard: pd.read_parquet(shard, columns=["image_id", "avg_caption_embedding"]),
        embeddings_parts,
    ),
    ignore_index=True,
)
print(len(embeddings_df))
embeddings_df.head()

118287


Unnamed: 0,image_id,avg_caption_embedding
0,203564,"[-0.0007135845953598619, 0.002707877429202199,..."
1,322141,"[0.022635236382484436, -0.018114957958459854, ..."
2,16977,"[0.004483063705265522, -0.004469511564821005, ..."
3,106140,"[0.0016963969683274627, 0.024761024862527847, ..."
4,571635,"[0.0051090167835354805, -0.014415529556572437,..."


Merge the tags and confidence scores with the caption embeddings on `image_id`

In [9]:
# Inner-join the tag/confidence rows with the caption embeddings on image_id.
# validate="one_to_one" makes pandas raise a MergeError if image_id is
# duplicated on either side, instead of silently multiplying rows.
merged_df = pd.merge(
    coco_df,
    embeddings_df,
    on="image_id",
    how="inner",
    validate="one_to_one",
)
print("Merged dataset shape:", merged_df.shape)
merged_df.head()

Merged dataset shape: (118287, 4)


Unnamed: 0,image_id,tags,confidence_scores,avg_caption_embedding
0,203564,bicycle | clock | white | black | wall | wall ...,"[0.9999895095825195, 0.999981164932251, 0.8835...","[-0.0007135845953598619, 0.002707877429202199,..."
1,322141,bathroom | sink | wall | faucet | bathroom acc...,"[0.999955415725708, 0.99992835521698, 0.968207...","[0.022635236382484436, -0.018114957958459854, ..."
2,16977,car | bench | park | road | sedan | sit | park...,"[0.9999231100082397, 0.9998983144760132, 0.977...","[0.004483063705265522, -0.004469511564821005, ..."
3,106140,plane | sky | fly | land | water | take | whit...,"[0.9999842643737793, 0.9638943672180176, 0.956...","[0.0016963969683274627, 0.024761024862527847, ..."
4,571635,bathroom | toilet bowl | sink | bathroom acces...,"[0.9999809265136719, 0.9998869895935059, 0.999...","[0.0051090167835354805, -0.014415529556572437,..."


Format the dataset so that each row is a pair (image_tag_string, V_caption)

In [14]:
def create_image_tag_string(row):
    """Render one row's tags and confidence scores as a single string.

    `row['tags']` is split on ' | ' and `row['confidence_scores']` is parsed
    as a Python literal. The two sequences are paired positionally (surplus
    entries on the longer side are dropped) and each pair is rendered as
    "tag: score"; the items are joined with ", ".
    """
    tag_names = row['tags'].split(' | ')
    tag_scores = ast.literal_eval(row['confidence_scores'])

    pieces = []
    for name, value in zip(tag_names, tag_scores):
        pieces.append(f"{name}: {value}")
    return ", ".join(pieces)

# Attach the rendered tag string to every merged row (adds a column to merged_df).
merged_df['image_tag_string'] = merged_df.apply(create_image_tag_string, axis=1)

# Keep only the text/embedding pair and expose the embedding as 'V_caption'.
final_df = (
    merged_df[['image_tag_string', 'avg_caption_embedding']]
    .rename(columns={'avg_caption_embedding': 'V_caption'})
)
final_df.head()

Unnamed: 0,image_tag_string,V_caption
0,"bicycle: 0.9999895095825195, clock: 0.99998116...","[-0.0007135845953598619, 0.002707877429202199,..."
1,"bathroom: 0.999955415725708, sink: 0.999928355...","[0.022635236382484436, -0.018114957958459854, ..."
2,"car: 0.9999231100082397, bench: 0.999898314476...","[0.004483063705265522, -0.004469511564821005, ..."
3,"plane: 0.9999842643737793, sky: 0.963894367218...","[0.0016963969683274627, 0.024761024862527847, ..."
4,"bathroom: 0.9999809265136719, toilet bowl: 0.9...","[0.0051090167835354805, -0.014415529556572437,..."


In [None]:
# Export step, currently disabled: uncommenting the line below writes `final_df`
# to a parquet file without the index.
# NOTE(review): later cells rebind `final_df` with other embedding variants, so
# what gets saved depends on which section ran last — confirm before re-enabling.
# final_df.to_parquet("Dataset/MS_COCO_2017_tags_embeddings.parquet", index=False)

---

In [4]:
# Parquet shards that together hold the `closest_caption_embedding` column.
embeddings_parts = [
    "Embeddings/coco_embeddings2_768_part_1.parquet",
    "Embeddings/coco_embeddings2_768_part_2.parquet",
    "Embeddings/coco_embeddings2_768_part_3.parquet",
]

# Load just the join key and the embedding column per shard and stack the
# shards, discarding the per-shard indexes.
embeddings_df = pd.concat(
    (
        pd.read_parquet(shard, columns=["image_id", "closest_caption_embedding"])
        for shard in embeddings_parts
    ),
    ignore_index=True,
)
print(len(embeddings_df))
embeddings_df.head()

118287


Unnamed: 0,image_id,closest_caption_embedding
0,203564,"[-0.010257664136588573, 0.020423714071512222, ..."
1,322141,"[0.010431958362460136, 0.010314032435417175, -..."
2,16977,"[-0.005434120539575815, -0.035251740366220474,..."
3,106140,"[-0.010122910141944885, 0.013357488438487053, ..."
4,571635,"[0.007634487468749285, 0.011867981404066086, -..."


In [5]:
# Inner-join the tag/confidence rows with the closest-caption embeddings on
# image_id. validate="one_to_one" makes pandas raise a MergeError if image_id
# is duplicated on either side, instead of silently multiplying rows.
merged_df = pd.merge(
    coco_df,
    embeddings_df,
    on="image_id",
    how="inner",
    validate="one_to_one",
)
print("Merged dataset shape:", merged_df.shape)
merged_df.head()

Merged dataset shape: (118287, 4)


Unnamed: 0,image_id,tags,confidence_scores,closest_caption_embedding
0,203564,bicycle | clock | white | black | wall | wall ...,"[0.9999895095825195, 0.999981164932251, 0.8835...","[-0.010257664136588573, 0.020423714071512222, ..."
1,322141,bathroom | sink | wall | faucet | bathroom acc...,"[0.999955415725708, 0.99992835521698, 0.968207...","[0.010431958362460136, 0.010314032435417175, -..."
2,16977,car | bench | park | road | sedan | sit | park...,"[0.9999231100082397, 0.9998983144760132, 0.977...","[-0.005434120539575815, -0.035251740366220474,..."
3,106140,plane | sky | fly | land | water | take | whit...,"[0.9999842643737793, 0.9638943672180176, 0.956...","[-0.010122910141944885, 0.013357488438487053, ..."
4,571635,bathroom | toilet bowl | sink | bathroom acces...,"[0.9999809265136719, 0.9998869895935059, 0.999...","[0.007634487468749285, 0.011867981404066086, -..."


In [6]:
def create_image_tag_string(row):
    """Build the "tag: score, tag: score, ..." string for one row.

    Tags come from splitting `row['tags']` on ' | '; scores come from parsing
    `row['confidence_scores']` as a Python literal. Pairing is positional and
    stops at the shorter of the two sequences.
    """
    labels = row['tags'].split(' | ')
    confidences = ast.literal_eval(row['confidence_scores'])
    return ", ".join(
        f"{label}: {confidence}" for label, confidence in zip(labels, confidences)
    )

# Add the rendered tag string as a new column on merged_df, one value per row.
merged_df['image_tag_string'] = merged_df.apply(create_image_tag_string, axis=1)

# Reduce to the text/embedding pair; the embedding column is renamed 'V_caption'.
final_df = (
    merged_df[['image_tag_string', 'closest_caption_embedding']]
    .rename(columns={'closest_caption_embedding': 'V_caption'})
)
final_df.head()

Unnamed: 0,image_tag_string,V_caption
0,"bicycle: 0.9999895095825195, clock: 0.99998116...","[-0.010257664136588573, 0.020423714071512222, ..."
1,"bathroom: 0.999955415725708, sink: 0.999928355...","[0.010431958362460136, 0.010314032435417175, -..."
2,"car: 0.9999231100082397, bench: 0.999898314476...","[-0.005434120539575815, -0.035251740366220474,..."
3,"plane: 0.9999842643737793, sky: 0.963894367218...","[-0.010122910141944885, 0.013357488438487053, ..."
4,"bathroom: 0.9999809265136719, toilet bowl: 0.9...","[0.007634487468749285, 0.011867981404066086, -..."


---

In [8]:
# Parquet shards that together hold the `concatenated_caption_embedding` column.
embeddings_parts = [
    "Embeddings/coco_embeddings3_768_part_1.parquet",
    "Embeddings/coco_embeddings3_768_part_2.parquet",
    "Embeddings/coco_embeddings3_768_part_3.parquet",
]

# Pull the join key plus the embedding column out of every shard and combine
# them into one frame with a fresh 0..n-1 index.
embeddings_df = pd.concat(
    (
        pd.read_parquet(shard, columns=["image_id", "concatenated_caption_embedding"])
        for shard in embeddings_parts
    ),
    ignore_index=True,
)
print(len(embeddings_df))
embeddings_df.head()

118287


Unnamed: 0,image_id,concatenated_caption_embedding
0,203564,"[0.03783470392227173, 0.037530187517404556, 0...."
1,322141,"[0.008814840577542782, 0.011330989189445972, 0..."
2,16977,"[0.043987542390823364, -0.01240357756614685, 0..."
3,106140,"[0.022646086290478706, 0.02198980003595352, 0...."
4,571635,"[0.032098181545734406, -0.02034934051334858, -..."


In [9]:
# Inner-join the tag/confidence rows with the concatenated-caption embeddings
# on image_id. validate="one_to_one" makes pandas raise a MergeError if
# image_id is duplicated on either side, instead of silently multiplying rows.
merged_df = pd.merge(
    coco_df,
    embeddings_df,
    on="image_id",
    how="inner",
    validate="one_to_one",
)
print("Merged dataset shape:", merged_df.shape)
merged_df.head()

Merged dataset shape: (118287, 4)


Unnamed: 0,image_id,tags,confidence_scores,concatenated_caption_embedding
0,203564,bicycle | clock | white | black | wall | wall ...,"[0.9999895095825195, 0.999981164932251, 0.8835...","[0.03783470392227173, 0.037530187517404556, 0...."
1,322141,bathroom | sink | wall | faucet | bathroom acc...,"[0.999955415725708, 0.99992835521698, 0.968207...","[0.008814840577542782, 0.011330989189445972, 0..."
2,16977,car | bench | park | road | sedan | sit | park...,"[0.9999231100082397, 0.9998983144760132, 0.977...","[0.043987542390823364, -0.01240357756614685, 0..."
3,106140,plane | sky | fly | land | water | take | whit...,"[0.9999842643737793, 0.9638943672180176, 0.956...","[0.022646086290478706, 0.02198980003595352, 0...."
4,571635,bathroom | toilet bowl | sink | bathroom acces...,"[0.9999809265136719, 0.9998869895935059, 0.999...","[0.032098181545734406, -0.02034934051334858, -..."


In [10]:
def create_image_tag_string(row):
    """Combine a row's tags with their confidence scores into one string.

    `row['tags']` is a ' | '-separated string and `row['confidence_scores']`
    is the text of a Python list literal. Each tag is matched with the score
    at the same position (unmatched trailing entries are ignored), formatted
    as "tag: score", and the results are joined with ", ".
    """
    parsed_scores = ast.literal_eval(row['confidence_scores'])
    split_tags = row['tags'].split(' | ')
    tag_score_pairs = zip(split_tags, parsed_scores)
    formatted = [f"{tag_name}: {tag_score}" for tag_name, tag_score in tag_score_pairs]
    return ", ".join(formatted)

# Store the per-row tag string on merged_df as the 'image_tag_string' column.
merged_df['image_tag_string'] = merged_df.apply(create_image_tag_string, axis=1)

# Project down to the two columns of interest and rename the embedding to 'V_caption'.
final_df = (
    merged_df[['image_tag_string', 'concatenated_caption_embedding']]
    .rename(columns={'concatenated_caption_embedding': 'V_caption'})
)
final_df.head()

Unnamed: 0,image_tag_string,V_caption
0,"bicycle: 0.9999895095825195, clock: 0.99998116...","[0.03783470392227173, 0.037530187517404556, 0...."
1,"bathroom: 0.999955415725708, sink: 0.999928355...","[0.008814840577542782, 0.011330989189445972, 0..."
2,"car: 0.9999231100082397, bench: 0.999898314476...","[0.043987542390823364, -0.01240357756614685, 0..."
3,"plane: 0.9999842643737793, sky: 0.963894367218...","[0.022646086290478706, 0.02198980003595352, 0...."
4,"bathroom: 0.9999809265136719, toilet bowl: 0.9...","[0.032098181545734406, -0.02034934051334858, -..."
