In [57]:
import string
import numpy as np
from PIL import Image
import os
from pickle import dump, load
import numpy as np
import random

from keras.applications.xception import Xception, preprocess_input
from keras.preprocessing.image import load_img, img_to_array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers.merge import Concatenate
from keras.models import Model, load_model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout

# small library for seeing the progress of loops.
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm().pandas()


0it [00:00, ?it/s]

In [33]:
DATASET_RAW = "../data/raw"
DATASET_INTERIM = "../data/interim"

## Load & clean captions

In [32]:
def clean_caption(caption, table):
    caption.replace("-", " ")    # Replace "-" with " "
    words = caption.split()      # Split the words
    
    words = [word.lower() for word in words]    # Convert to lowercase
    words = [word.translate(table) for word in words]  # Remove punctuations
    words = [word for word in words if(len(word)>1)]   # Remove 's and a
    words = [word for word in words if(word.isalpha())] # Remove tokens with numbers
    
    return ' '.join(words)


def load_captions(filename):
    # Load the text file into memory
    file = open(filename, 'r')
    text = file.read()
    file.close()
    table = str.maketrans('','',string.punctuation)
    
    img_captions = dict()
    
    lines = text.split('\n')
    for line in lines[1:]:
        if len(line) == 0:
            continue
        img, caption = line.split(',', 1)
        # Clean the caption text
        caption = clean_caption(caption, table)
        if img in img_captions:
            img_captions[img].append(caption)
        else:
            img_captions[img] = [caption]
        
    return img_captions


def text_vocabulary(img_captions):
    vocab = set()
    for img in img_captions:
        [vocab.update(d.split()) for d in img_captions[img]]

    return vocab

img_captions = load_captions(os.path.join(DATASET_RAW, 'captions.txt'))
print(f"Number of captions: \t{len(img_captions)}")

vocabulary = text_vocabulary(img_captions)
print(f"Length of vocab: \t{len(vocabulary)}")


Number of captions: 	8091
Length of vocab: 	8763


In [36]:
def save_img_captions(img_captions, filename):
    lines = list()
    for key, desc_list in img_captions.items():
        for desc in desc_list:
            lines.append(key + '\t' + desc )
    data = "\n".join(lines)
    file = open(filename,"w")
    file.write(data)
    file.close()
    

save_img_captions(img_captions, os.path.join(DATASET_INTERIM, 'captions.txt'))

## Extracting the features from images 

In [37]:
def extract_img_features(directory):
    model = Xception(include_top=False, pooling='avg')
    features = {}
    for img in tqdm(os.listdir(directory)):
        filename = directory + "/" + img
        image = Image.open(filename)
        image = image.resize((299,299))
        image = np.expand_dims(image, axis=0)
        image = image/127.5
        image = image - 1.0

        feature = model.predict(image)
        features[img] = feature
    return features

#2048 feature vector
img_features = extract_img_features(os.path.join(DATASET_RAW, 'Images'))
dump(img_features, open(os.path.join(DATASET_INTERIM, 'img_features.pkl'), 'wb'))

2023-11-04 20:07:09.421439: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-04 20:07:09.475971: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-04 20:07:09.477446: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-04 20:07:09.479969: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for img in tqdm(os.listdir(directory)):


  0%|          | 0/8091 [00:00<?, ?it/s]

2023-11-04 20:07:18.987298: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2023-11-04 20:07:23.180721: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8201
2023-11-04 20:07:27.142004: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [48]:
img_features = load(open(os.path.join(DATASET_INTERIM, 'img_features.pkl'),"rb"))

print(f"# of images:\t{len(img_features)}")
print(img_features['1000268201_693b08cb0e.jpg'].shape)

# of images:	8091
(1, 2048)


In [50]:
len(img_captions)

8091

## Split the training/dev/test set

In [60]:
img_files = list(img_captions.keys())

assert len(img_files) > 7000, "The list must have more than 7000 elements."

# Shuffle the list in place
random.shuffle(img_files)

# Split into training, dev, and test datasets
train_imgs = img_files[:6000]
dev_imgs = img_files[6000:7000]
test_imgs = img_files[7000:]  # The remaining part of the list

print(f"train: {len(train_imgs)} | dev: {len(dev_imgs)} | test: {len(test_imgs)}")

train: 6000 | dev: 1000 | test: 1091
