# Embeddings
Create embeddings for the images.

In [7]:
import threading, math, os, json

import numpy as np
import pandas as pd

from keras.applications import ResNet50, ResNet50V2, ResNet101, ResNet101V2, ResNet152, ResNet152V2, InceptionV3, InceptionResNetV2, VGG16, VGG19, Xception
from keras.applications.resnet import preprocess_input as resnet_preprocess_input
from keras.applications.resnet_v2 import preprocess_input as resnet_v2_preprocess_input
from keras.applications.inception_v3 import preprocess_input as inception_preprocess_input
from keras.applications.inception_resnet_v2 import preprocess_input as inception_resnet_v2_preprocess_input
from keras.applications.vgg16 import preprocess_input as vgg16_preprocess_input
from keras.applications.vgg19 import preprocess_input as vgg19_preprocess_input
from keras.applications.xception import preprocess_input as xception_preprocess_input
from keras.layers import GlobalMaxPooling2D
from keras.utils import load_img as keras_load_img
from keras.utils import img_to_array
from keras import Sequential

from functions import idp, odp, move_files, create_directory_if_not_exists, remove_directory_if_exists, list_directory_if_exists

In [8]:
# Name of the pre-trained model to use.
MODEL = 'ResNet50'

# Model input dimensions.
IMG_WIDTH = 224
IMG_HEIGHT = 224
CHANNELS = 3

# Number of articles to process in each thread.
NR_ROWS_PER_THREAD = 6000

# Columns that are one-hot-encoded before being added to the extended embeddings.
ONE_HOT_ENCODE_COLUMNS = ['index_group_no', 'garment_group_no']

# Columns that are added to the extended embeddings as they are.
OTHER_COLUMNS = ['popularity']

In [9]:
def embeddings_odp(filename, creation=False):
    """
    Build the output path of an embeddings file and make sure its directory exists.
    :param filename: the filename of the file
    :param creation: when True the file lives in 'embeddings_creation', otherwise in 'embeddings'
    :return: the filename with path
    """
    if creation:
        directory = 'embeddings_creation'
    else:
        directory = 'embeddings'
    # create the directory up front so callers can write to the returned path right away
    create_directory_if_not_exists(directory=odp(filename=directory))
    return odp(filename=f'{directory}/{filename}')

## Fetch model

In [10]:
# https://keras.io/api/applications/
# Every supported model name maps to (constructor, matching preprocessing function).
# The ResNet*V2 models need the resnet_v2 preprocessing (pixels scaled to [-1, 1]);
# the resnet (V1) preprocessing does caffe-style mean subtraction and does not match
# the inputs the V2 ImageNet weights were trained on.
_MODEL_REGISTRY = {
    'ResNet50': (ResNet50, resnet_preprocess_input),
    'ResNet50V2': (ResNet50V2, resnet_v2_preprocess_input),
    'ResNet101': (ResNet101, resnet_preprocess_input),
    'ResNet101V2': (ResNet101V2, resnet_v2_preprocess_input),
    'ResNet152': (ResNet152, resnet_preprocess_input),
    'ResNet152V2': (ResNet152V2, resnet_v2_preprocess_input),
    'InceptionV3': (InceptionV3, inception_preprocess_input),
    'InceptionResNetV2': (InceptionResNetV2, inception_resnet_v2_preprocess_input),
    'VGG16': (VGG16, vgg16_preprocess_input),
    'VGG19': (VGG19, vgg19_preprocess_input),
    'Xception': (Xception, xception_preprocess_input),
}

if MODEL not in _MODEL_REGISTRY:
    raise ValueError('Model not recognized')

_model_constructor, preprocess_input = _MODEL_REGISTRY[MODEL]

# NOTE(review): Keras reads input_shape as (height, width, channels); the constants are
# passed as (IMG_WIDTH, IMG_HEIGHT, CHANNELS), which is only equivalent for square inputs.
base_model = _model_constructor(weights='imagenet', include_top=False, input_shape=(IMG_WIDTH, IMG_HEIGHT, CHANNELS))

# The pre-trained network is used as a fixed feature extractor; global max pooling
# collapses the spatial axes so that each image yields a single embedding vector.
base_model.trainable = False
model = Sequential([base_model, GlobalMaxPooling2D()])

model.summary()

2022-11-29 13:41:13.150741: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-29 13:41:13.313508: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/jana/anaconda3/envs/tf/lib/python3.9/site-packages/cv2/../../lib64:
2022-11-29 13:41:13.313521: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-11-29 13:41:13.314041: I tensorflow/core/platform/cpu_feature_guard

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resnet50 (Functional)       (None, 7, 7, 2048)        23587712  
                                                                 
 global_max_pooling2d (Globa  (None, 2048)             0         
 lMaxPooling2D)                                                  
                                                                 
Total params: 23,587,712
Trainable params: 0
Non-trainable params: 23,587,712
_________________________________________________________________


In [11]:
# Length of one embedding vector: the last axis of the model output (the pooling layer is the final layer).
embedding_shape = model.output_shape[-1]
embedding_shape

2048

## Embedding pipeline
### Calculating the embeddings using multithreading
Each thread outputs a file with the embeddings for the rows it processed.

In [12]:
# Load the processed articles table and remember how many rows it has.
articles_path = idp(filename='articles_processed.feather')
article_df = pd.read_feather(articles_path)
nr_articles = len(article_df)

In [13]:
def get_embedding_for_image(img_name):
    """
    Smallest piece in the embedding-creation process.
    This function calculates the embedding of a single image.
    Failures are deliberately non-fatal (a zero vector is returned), but they are reported
    so that zero embeddings caused by errors can be traced back to the offending image.
    :param img_name: the name of the image for which to get the embedding
    :return: the embedding (or a numpy array of zeros if the image couldn't be found or an exception occurred, with shape embedding_shape)
    """
    # 'does not exist' is the marker value for articles without an image
    if img_name == 'does not exist':
        return np.zeros(embedding_shape)

    try:
        img = keras_load_img(idp(f'images/{img_name}'), target_size=(IMG_WIDTH, IMG_HEIGHT))     # load and resize image
        img_array = np.expand_dims(img_to_array(img), axis=0)                   # convert to array and add a leading batch axis
        img_array = preprocess_input(img_array)                                 # model-specific preprocessing
        return model.predict(img_array, verbose=0).reshape(-1)                  # flatten the (1, n) prediction to (n,)
    except Exception as exc:
        # keep the best-effort fallback, but do not hide the failure
        print(f"[ !    ] Embedding failed     : {img_name} ({type(exc).__name__}: {exc}) --> zeros used instead")
        return np.zeros(embedding_shape)

def embedding_creation_thread_function(min_row_ind, max_row_ind, thread_nr):
    """
    Worker run by a single thread: embed the images of one slice of article_df and store the result.
    The slice is article_df.iloc[min_row_ind:max_row_ind], so row max_row_ind itself is not included.
    The embeddings are written to 'embeddings_<thread_nr>.feather' in the embeddings-creation directory.
    :param min_row_ind: first row position of the slice (inclusive)
    :param max_row_ind: end row position of the slice (exclusive)
    :param thread_nr: nr of the thread, used in the progress messages and in the output filename
    """
    print(f"[=>    ] Started              : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")
    image_names = article_df.iloc[min_row_ind:max_row_ind]['image_name']
    embedding_per_row = image_names.apply(get_embedding_for_image)
    print(f"[===>  ] Creating embeddings  : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")
    # spread every embedding vector over its own columns; the row labels end up in an 'index' column
    embeddings_df = embedding_per_row.apply(pd.Series).reset_index()
    # column labels are converted to strings before writing to feather
    embeddings_df.columns = embeddings_df.columns.astype(str)
    target_file = embeddings_odp(filename=f'embeddings_{thread_nr}.feather', creation=True)
    embeddings_df.to_feather(target_file)
    print(f"[=====>] Finished             : Thread {thread_nr} ({min_row_ind} --> {max_row_ind})")

def run_threaded_embedding_creation():
    """
    Create the embeddings for all images in the dataset.
    The embeddings are created image per image, which keeps the RAM consumption fairly low. Therefore, to speed things up, multithreading is used.
    Each thread calculates the embeddings for at most NR_ROWS_PER_THREAD rows of article_df and writes them to a file numbered by this thread.
    Note that join() does not re-raise an exception that occurred inside a worker thread.
    :raises ValueError: if NR_ROWS_PER_THREAD is not positive (the chunking would never advance)
    """
    if NR_ROWS_PER_THREAD <= 0:
        raise ValueError(f'NR_ROWS_PER_THREAD must be positive, got {NR_ROWS_PER_THREAD}')

    nr_rows = article_df.shape[0]       # single source of truth for the number of rows to process
    threads = list()

    # create and start one thread per chunk of rows; thread numbers start at 1
    for thread_nr, min_row_ind in enumerate(range(0, nr_rows, NR_ROWS_PER_THREAD), start=1):
        # clamp the (exclusive) upper bound so the last chunk never reports rows that do not exist
        max_row_ind = min(nr_rows, min_row_ind + NR_ROWS_PER_THREAD)
        print(f"Main    : created and started thread {thread_nr}")
        thread = threading.Thread(target=embedding_creation_thread_function, args=(min_row_ind, max_row_ind, thread_nr))
        threads.append(thread)
        thread.start()

    # join threads in creation order
    for thread_nr, thread in enumerate(threads, start=1):
        print(f"Main    : next thread to join: {thread_nr}")
        thread.join()
        print(f"Main    : thread {thread_nr} done")

In [None]:
%%time

# Compute the embeddings for all articles; every worker thread writes its own feather file.
run_threaded_embedding_creation()

Main    : created and started thread %d 1
[=>    ] Started              : Thread 1 (0 --> 6000)
Main    : created and started thread %d 2
[=>    ] Started              : Thread 2 (6000 --> 12000)
Main    : created and started thread %d 3
[=>    ] Started              : Thread 3 (12000 --> 18000)
Main    : created and started thread %d 4
[=>    ] Started              : Thread 4 (18000 --> 24000)
Main    : created and started thread %d 5
[=>    ] Started              : Thread 5 (24000 --> 30000)
Main    : created and started thread %d 6
[=>    ] Started              : Thread 6 (30000 --> 36000)
Main    : created and started thread %d 7
[=>    ] Started              : Thread 7 (36000 --> 42000)
Main    : created and started thread %d 8
[=>    ] Started              : Thread 8 (42000 --> 48000)
Main    : created and started thread %d 9
[=>    ] Started              : Thread 9 (48000 --> 54000)
Main    : created and started thread %d 10
[=>    ] Started              : Thread 10 (54000 --> 6

### Join embedding files
The separate dataframes are joined to obtain a single dataframe with all embeddings.

In [None]:
def join_embedding_files():
    """
    Merge the per-thread embedding files into one dataframe and store it as 'embeddings.feather'.
    The helper columns 'Unnamed: 0' and 'index' are dropped when present, and the temporary
    'embeddings_creation' directory is removed once the merged file has been written.
    :return: the dataframe containing all embeddings
    """
    # one file per thread, numbered from 1
    nr_files = math.ceil(nr_articles / NR_ROWS_PER_THREAD)
    embeddings_list = []
    for file_nr in range(1, nr_files + 1):
        part_path = embeddings_odp(filename=f'embeddings_{file_nr}.feather', creation=True)
        embeddings_list.append(pd.read_feather(part_path))
    all_embeddings = pd.concat(embeddings_list, ignore_index=True)

    # only the embedding columns should remain
    for helper_column in ('Unnamed: 0', 'index'):
        if helper_column in all_embeddings.columns.values:
            all_embeddings = all_embeddings.drop([helper_column], axis=1)

    embeddings_dir = odp(filename='embeddings')
    if not os.path.isdir(embeddings_dir):
        os.mkdir(embeddings_dir)
    all_embeddings.to_feather(embeddings_odp(filename='embeddings.feather'))
    remove_directory_if_exists(directory=odp(filename='embeddings_creation'))
    return all_embeddings

In [None]:
%%time

# Merge the per-thread files into a single dataframe and preview its first rows.
embeddings = join_embedding_files()
embeddings.head()

### Create extended embeddings
Enlarge the embeddings by adding article properties.
- `ONE_HOT_ENCODE_COLUMNS` = list of columns that should be added to the embeddings by one-hot-encoding them
- `OTHER_COLUMNS` = list of columns that should be added unchanged (as they are)

In [None]:
%%time

# Attach the selected article columns to a copy of the embeddings (aligned on the row index),
# one-hot-encode the categorical ones, then store and preview the result.
added_columns = {column: article_df[column] for column in ONE_HOT_ENCODE_COLUMNS + OTHER_COLUMNS}
extended_embeddings = pd.get_dummies(embeddings.assign(**added_columns), columns=ONE_HOT_ENCODE_COLUMNS)
extended_embeddings.to_feather(embeddings_odp(filename='extended_embeddings.feather'))
extended_embeddings.head()

### Store files
Create a directory named after the constants defined earlier (model name and image size), so that we can later still tell how these embeddings were obtained.

In [None]:
# The directory name encodes the model and the input size used to create the embeddings.
dir_name = f'embeddings_{MODEL}_W{IMG_WIDTH}_H{IMG_HEIGHT}'
target_dir = embeddings_odp(filename=dir_name)
create_directory_if_not_exists(directory=target_dir)

# Record which article columns went into the extended embeddings.
# The context manager guarantees the file is flushed and closed before any files are moved.
with open(f'{target_dir}/columns.json', 'w') as columns_file:
    json.dump({'ONE_HOT_ENCODE_COLUMNS': ONE_HOT_ENCODE_COLUMNS, 'OTHER_COLUMNS': OTHER_COLUMNS}, columns_file)

# NOTE(review): target_dir was created inside 'embeddings' before this listing is taken, so the
# listing presumably contains target_dir itself — confirm that move_files copes with that.
filenames = list_directory_if_exists(directory=odp('embeddings'))
# embeddings_odp(filename='') ends with the path separator, which [:-1] strips off
move_files(filenames=filenames, dir_from=embeddings_odp(filename='')[:-1], dir_to=target_dir)

In [None]:
# Clear the kernel namespace (-f: without asking for confirmation).
%reset -f