In [8]:
import numpy as np
import pandas as pd
import pickle
from sklearn.neighbors import NearestNeighbors
# BLEU scoring
from nltk.translate.bleu_score import corpus_bleu

from src.data.data_fetcher import download
from src.data.image_loader import encoded_image_loader
from src.data.data_processing import DataPreprocessor

Download all the necessary data from Google Drive and place it in the `data` directory.

In [4]:
# Column layout shared by both caption files (tab-separated, no header row).
CAPTION_COLUMNS = ['image_id', 'caption']

# Load the pre-computed image encodings; later cells index this object by image id.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open("./data/encoded_images_PCA.p", "rb") as f:
    encoded_images = pickle.load(f)

# Read the train and test caption tables with identical parsing options.
train_data, test_data = (
    pd.read_csv(path, sep='\t', header=None, names=CAPTION_COLUMNS)
    for path in ("./data/flickr_8k_train_dataset.txt", "./data/flickr_8k_test_dataset.txt")
)

# Sanity check: print the column index of each frame.
print(train_data.columns)
print(test_data.columns)

Index(['image_id', 'caption'], dtype='object')
Index(['image_id', 'caption'], dtype='object')


We build the train and test feature sets in the same way: for each split, we extract the encoded images (feature vectors) from the dictionary `encoded_images`, then convert them into a NumPy array for matrix manipulation.

In [7]:

def _select_encoded_images(image_ids):
    """Keep only the image ids that have an encoding and stack their vectors.

    Parameters
    ----------
    image_ids : array of image ids (as returned by ``Series.unique()``).

    Returns
    -------
    (kept_ids, features) : ``kept_ids`` is ``image_ids`` restricted to the ids
        found in ``encoded_images`` (original order preserved) and
        ``features[i]`` is ``encoded_images[kept_ids[i]]``.
    """
    has_encoding = np.array([img in encoded_images for img in image_ids], dtype=bool)
    kept_ids = image_ids[has_encoding]
    features = np.array([encoded_images[img] for img in kept_ids])
    return kept_ids, features


# Build the train and test sets from the encoded images.
# The id arrays are filtered with the same mask as the features: neighbour
# indices returned by the model are later used to index `train_image_names`,
# so entry i of the names must describe row i of the feature matrix even when
# some image has no encoding.
train_image_names, image_features_train = _select_encoded_images(train_data['image_id'].unique())
test_image_names, image_features_test = _select_encoded_images(test_data['image_id'].unique())

# Row i of each feature matrix must correspond to entry i of the name array.
assert len(train_image_names) == len(image_features_train)
assert len(test_image_names) == len(image_features_test)

image_features_train.shape, image_features_test.shape

((6000, 100), (1000, 100))

We will fit an unsupervised Nearest Neighbors model, which finds the nearest neighbors of a query point given a distance metric and a number of neighbors to return.

Once the model is fitted on the training set, we look up the most similar training image for each image in the test set and use its caption as the predicted caption.

In [9]:
# Train the model: build the nearest-neighbour index over the training features.
# n_neighbors=5 is the neighbour count configured on the estimator.
nbrs = NearestNeighbors(n_neighbors=5, algorithm='auto')
nbrs = nbrs.fit(image_features_train)  # the value returned by fit() is what later cells query

In [10]:
# Nearest-neighbour lookup
def find_nearest_neighbors(test_image_features, k=5):
    """Return the indices of the k nearest training images for one query image.

    Parameters
    ----------
    test_image_features : feature vector of a single image.
    k : number of neighbours to retrieve (default 5).

    Returns
    -------
    The index array produced by the global ``nbrs`` model for this single
    query; the values are row positions in the data ``nbrs`` was fitted on.
    """
    # kneighbors works on a batch of queries, so wrap the single vector in a list.
    query_batch = [test_image_features]
    neighbor_indices = nbrs.kneighbors(query_batch, n_neighbors=k, return_distance=False)
    return neighbor_indices[0]

# Example lookup for a single test image.
test_image_idx = 0  # change this index to try other test images
test_image_features = image_features_test[test_image_idx]
nearest_neighbors = find_nearest_neighbors(test_image_features, k=5)
print("Nearest train image :", nearest_neighbors)

Nearest train image : [1285 4955 3690 3515 5006]


In [13]:
# Caption lookup for the nearest neighbours
def get_captions_from_indices(indices):
    """Return one caption per training-image index.

    Parameters
    ----------
    indices : iterable of positions into the global ``train_image_names``.

    Returns
    -------
    list with, for each index, the first caption listed in ``train_data`` for
    that image (same order as ``indices``).
    """
    def first_caption(img_name):
        # Select every caption row of this image and keep the first one.
        rows_for_image = train_data['image_id'] == img_name
        return train_data.loc[rows_for_image, 'caption'].iloc[0]

    return [first_caption(train_image_names[idx]) for idx in indices]

# Show the caption retrieved for each neighbour of the example test image.
nearest_captions = get_captions_from_indices(nearest_neighbors)
print("Nearest captions :")
for caption in nearest_captions:
    print("- ", caption)

Nearest captions :
-  <start> A man in a white shirt and sunglasses gazes into the horizon . <end>
-  <start> A person in a scuba suit holds a very small lobster . <end>
-  <start> A man in a black shirt enjoys a snack while a woman in a white shirt looks confused . <end>
-  <start> A goalie is covering his net while two other hockey players chase after the hockey puck . <end>
-  <start> Two girls giving the peace sign . <end>


In [14]:
# Caption generation for the test set
def generate_captions_for_test_set():
    """Predict one caption per test image by copying a training caption.

    For every row of the global ``image_features_test`` the neighbours are
    retrieved with ``find_nearest_neighbors`` and the caption of the first
    returned neighbour is used as the prediction.

    Returns
    -------
    list of caption strings, one per row of ``image_features_test``, in the
    same order.
    """
    predictions = []
    for query_features in image_features_test:
        neighbor_indices = find_nearest_neighbors(query_features)
        # Only the first neighbour's caption is kept, so look up just that one
        # instead of scanning the caption table for every neighbour.
        first_neighbor_caption = get_captions_from_indices(neighbor_indices[:1])[0]
        predictions.append(first_neighbor_caption)
    return predictions


# One predicted caption per row of image_features_test.
predicted_captions = generate_captions_for_test_set()

# Tokenised reference caption for every test image that has an encoding.
# NOTE(review): only the first caption of each image is used as reference,
# although corpus_bleu accepts several references per hypothesis.
# NOTE(review): the train captions printed earlier carry <start>/<end> markers;
# if the test captions do too, those markers count as matching tokens — confirm
# this is intended.
reference_captions = []
for img in test_image_names:
    if img not in encoded_images:
        continue
    caption_rows = test_data.loc[test_data['image_id'] == img, 'caption']
    reference_captions.append(caption_rows.values[0].split())

# corpus_bleu takes, for each hypothesis, a list of reference token lists.
references_per_image = [[ref] for ref in reference_captions]
hypotheses = [pred.split() for pred in predicted_captions]
bleu_score = corpus_bleu(references_per_image, hypotheses)
print("Score BLEU :", bleu_score)


Score BLEU : 0.029573767019698084
