#### all imports

In [2]:
import tensorflow as tf
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model
from keras.applications.vgg19 import VGG19 , preprocess_input
from keras.utils import load_img, img_to_array
from tqdm import tqdm
import re
import pickle

In [3]:
import json #to create dataset metadata json file

### initialize a generator for the validation images 
#### Note: to process the training images instead, replace `val2014` with `train2014` in `image_dir` (and rename the output files accordingly)

In [4]:
# Where the dataset lives and where this notebook writes its artifacts
data_dir = "/kaggle/input/visual-question-answering/"
output_dir = "/kaggle/working/"
image_dir = os.path.join(data_dir, "val2014")
output_file = os.path.join(output_dir, "val_features.npy")

# Image pipeline: resize to 224x224 and apply the VGG19 preprocessing function.
target_size = (224, 224)
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# class_mode=None -> the iterator yields image batches only (no labels).
# shuffle=False   -> later cells pair the extracted features with
#                    generator.filenames by position, so the order must be stable.
flow_options = dict(
    target_size=target_size,
    batch_size=32,
    class_mode=None,
    shuffle=False,
)
generator = datagen.flow_from_directory(image_dir, **flow_options)

Found 40504 images belonging to 1 classes.


#### load VGG19 and extract image features

In [5]:
# Build a VGG19 network with ImageNet weights to use as the feature extractor.
# include_top=True (the default) is spelled out because the 'flatten' layer
# looked up later belongs to the classifier top of the network.
base_model = VGG19(include_top=True, weights='imagenet')

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels.h5


In [7]:
# Truncate VGG19 at its 'flatten' layer so that predictions are the flattened
# activations of that layer rather than class probabilities.
flatten_output = base_model.get_layer('flatten').output
model = Model(inputs=base_model.input, outputs=flatten_output)

In [None]:
# Extract image features for every image served by `generator`.
# NOTE: the array is named `train_features`, but it holds the features of
# whichever split `image_dir` points to (val2014 in this notebook).

# Rewind the iterator so that batch 0 is served first. Without this, re-running
# the cell after an interrupted pass would start mid-stream and the rows would
# no longer line up with generator.filenames.
generator.reset()

train_features = []
for _ in tqdm(range(len(generator))):  # len(generator) == number of batches
    batch = next(generator)
    features = model.predict_on_batch(batch)
    train_features.append(features)

# Concatenate the per-batch outputs into one array
train_features = np.concatenate(train_features)

# Fail loudly on a row-count mismatch: reshape(..., -1) below could otherwise
# raise an unrelated error or silently produce rows that mix several images.
n_images = len(generator.filenames)
if train_features.shape[0] != n_images:
    raise ValueError(
        f"extracted {train_features.shape[0]} feature rows for {n_images} images"
    )

# One flat feature vector per image
train_features = train_features.reshape((n_images, -1))

# Save the extracted features to a numpy file
np.save(output_file, train_features)

#### save features with IDs in a dictionary in a pkl file

In [None]:
# add ids to features

# An image id is the first run of five or more ASCII digits in the file name.
_IMAGE_ID_PATTERN = re.compile(r"[0-9]{5,}")


def extract_image_id(filename):
    """Return the numeric image id embedded in an image file name.

    Only the base name is inspected, so digits that happen to appear in a
    directory component of the path are never mistaken for the id.

    Args:
        filename: path of the image, e.g. an entry of ``generator.filenames``.

    Returns:
        The first run of five or more digits in the base name, as an ``int``.

    Raises:
        ValueError: if the base name contains no such run of digits.
    """
    match = _IMAGE_ID_PATTERN.search(os.path.basename(filename))
    if match is None:
        raise ValueError(f"no image id found in file name: {filename!r}")
    return int(match.group())


img_ids = np.array([extract_image_id(name) for name in generator.filenames])

# Features are paired with ids purely by position, so the counts must agree.
if len(img_ids) != len(train_features):
    raise ValueError(
        f"{len(img_ids)} image ids but {len(train_features)} feature rows"
    )

# A repeated id would silently overwrite an earlier entry in the dictionary.
if len(np.unique(img_ids)) != len(img_ids):
    raise ValueError("duplicate image ids extracted from generator.filenames")

# Map each image id to its feature vector (row i belongs to file i).
image_features = dict(zip(img_ids, train_features))

In [None]:
# Persist the {image id: feature vector} dictionary as a pickle file in the
# current working directory.
# NOTE(review): the file name says "test" although image_dir points at val2014
# - confirm which name downstream notebooks expect before renaming.
with open('test_image_features.pkl', 'wb') as pickle_file:
    pickle.dump(image_features, pickle_file)
    print('dictionary saved successfully to file')

# end