## <a id='toc1_1_'></a>[Setup : chargement des données et des modèles](#toc0_)

In [1]:
import os


# Dataset batch size
BATCH_SIZE = 256
# Directory containing the dataset pickles
DATA_DIR = os.path.join("data", "pickle_img_datasets")
# Directory containing images
IMAGES_DIR = os.path.join("data", "images", "image_train")

In [2]:
from tensorflow import keras
import tensorflow as tf
import os
import pickle
from src.data import data


# Load data
X_train = pickle.load(
    open(os.path.join(DATA_DIR, "X_train.pkl"), "rb")).fillna("")
X_test = pickle.load(
    open(os.path.join(DATA_DIR, "X_test.pkl"), "rb")).fillna("")
y_test = pickle.load(open(os.path.join(DATA_DIR, "y_test.pkl"), "rb"))
y_train = pickle.load(open(os.path.join(DATA_DIR, "y_train.pkl"), "rb"))

# Extract the features to be ready for preprocessing
X_train_features = X_train['designation'] + " " + X_train['description']
X_test_features = X_test['designation'] + " " + X_test['description']

# Store the file path to images in variables
X_train_images = data.get_imgs_filenames(
    X_train["productid"], X_train["imageid"], IMAGES_DIR)
X_test_images = data.get_imgs_filenames(
    X_test["productid"], X_test["imageid"], IMAGES_DIR)

# Define DataFrame names for preprocessing
X_train_features.name = "X_train"
X_test_features.name = "X_test"

# Load text model
text_model = keras.models.load_model(
    os.path.join("data", "models", "mlp_text", "mlp_model_v2.1.h5"), compile=False)

# Load image model
image_model = keras.models.load_model(
    os.path.join("data", "models", "cnn_mobilenetv2_keras",
                 "cnn_mobilenetv2.h5"),
    compile=False)

# <a id='toc2_'></a>[Preprocessing du texte](#toc0_)

In [3]:
from src.data.text_preproc_pipeline import TextPreprocess
from src.data.vectorization_pipeline import TfidfStemming


# Data preprocessing
text_preprocessor = TextPreprocess(TfidfStemming())
text_preprocessor.fit(X_train_features)
X_test_preproc = text_preprocessor.transform(X_test_features)
X_train_preproc = text_preprocessor.transform(X_train_features)

INFO:textpipeline:class:TfidfStemming
INFO:textpipeline:TextPreprocess.fit X_train 158.31 seconds
INFO:textpipeline:TextPreprocess.transform X_test 36.67 seconds
INFO:textpipeline:TextPreprocess.transform X_train 213.52 seconds


# <a id='toc3_'></a>[Prédictions du modèle texte](#toc0_)

**Table of contents**<a id='toc0_'></a>    
- [Setup : chargement des données et des modèles](#toc1_1_)    
- [Preprocessing du texte](#toc2_)    
- [Prédictions du modèle texte](#toc3_)    
- [Preprocessing des images](#toc4_)    
- [Prédictions du modèle image](#toc5_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [10]:
from sklearn.metrics import accuracy_score
from src.data.data import convert_sparse_matrix_to_sparse_tensor, get_model_prediction


# Predict the categories of X_test
y_pred_text = text_model.predict(
    convert_sparse_matrix_to_sparse_tensor(X_test_preproc))

# Display the accuracy score
print("Text model accuracy score:", accuracy_score(
    y_test, get_model_prediction(y_pred_text)))

Text model accuracy score: 0.8086434291097504


# <a id='toc4_'></a>[Preprocessing des images](#toc0_)

In [4]:
from src.data.data import PRDTYPECODE_DIC, to_simplified_prdtypecode


def open_resize_img(filename: str, y) -> None:
    """
    Open image using the filename and return a resized version of it ready for the image model.

    Argument:
    - filename: complete path to image file including the extension.

    Return:
    - Image matrix in a tensor.
    """
    img = tf.io.read_file(filename)
    img = tf.io.decode_jpeg(img, channels=3)
    return (tf.image.resize(img, [224, 224]), y)


# Convert the prdtypecode to their equivalent in a range from 0 to 26
y_test_simplified = to_simplified_prdtypecode(y_test)
# Transforms y_test to a one hot version
y_test_categorical = tf.keras.utils.to_categorical(
    y_test_simplified, num_classes=len(PRDTYPECODE_DIC.keys()))

# Create the Dataset to feed the model with correctly sized images by batch
test_images_dataset = tf.data.Dataset.from_tensor_slices((X_test_images, y_test_categorical)) \
    .map(open_resize_img) \
    .batch(BATCH_SIZE)

# Convert the prdtypecode to their equivalent in a range from 0 to 26
y_train_simplified = to_simplified_prdtypecode(y_train)
# Transforms y_test to a one hot version
y_train_categorical = tf.keras.utils.to_categorical(
    y_train_simplified, num_classes=len(PRDTYPECODE_DIC.keys()))

# Create the Dataset to feed the model with correctly sized images by batch
train_images_dataset = tf.data.Dataset.from_tensor_slices((X_train_images, y_train_categorical)) \
    .map(open_resize_img) \
    .batch(BATCH_SIZE)

# <a id='toc5_'></a>[Prédictions du modèle image](#toc0_)

In [17]:
from sklearn.metrics import accuracy_score
from src.data.data import get_model_prediction


# Predict the categories of X_test
y_pred_image = image_model.predict(test_images_dataset)

# Display the accuracy score
print("Image model accuracy score:", accuracy_score(
    y_test, get_model_prediction(y_pred_image)))

Image model accuracy score: 0.5444536033914272


# <a id='toc5_'></a>[Prédiction avant dernière couche du modèle texte](#toc0_)

In [6]:
from src.data.data import convert_sparse_matrix_to_sparse_tensor

# text model without head
text_model_wo_head = tf.keras.Model(
                            inputs=text_model.inputs,
                            outputs=text_model.layers[-2].output)

# Predict the output n-1 layer of X_test
test_text_layer = text_model_wo_head.predict(
     convert_sparse_matrix_to_sparse_tensor(X_test_preproc))
# Predict the output n-1 layer of X_train
train_text_layer = text_model_wo_head.predict(
     convert_sparse_matrix_to_sparse_tensor(X_train_preproc))




# <a id='toc5_'></a>[Prédiction avant dernière couche du modèle image](#toc0_)

In [7]:
# image model without head
image_model_wo_head = tf.keras.Model(inputs=image_model.inputs,
                           outputs=image_model.layers[-2].output)

# Predict the output n-1 layer with X_test
test_image_output = image_model_wo_head.predict(test_images_dataset)

# Predict the output n-1 layer with X_train
train_image_output = image_model_wo_head.predict(train_images_dataset)





# <a id='toc5_'></a>[Data concatenation](#toc0_)

In [8]:
import numpy as np

# Text/Image train concatenation 
train_concat_layer = np.concatenate((train_text_layer, train_image_output), axis=1)

filename = os.path.join(DATA_DIR, r'fusion_train_data.pkl')
with open(filename, 'wb') as fp:
    pickle.dump(train_concat_layer, fp)

# Text/Image train concatenation 
test_concat_layer = np.concatenate((test_text_layer, test_image_output), axis=1)

filename = os.path.join(DATA_DIR, r'fusion_test_data.pkl')
with open(filename, 'wb') as fp:
    pickle.dump(test_concat_layer, fp)


In [11]:
# Create dataset
train_fusion_dataset = tf.data.Dataset.from_tensor_slices((train_concat_layer, y_train_categorical)).batch(BATCH_SIZE)
test_fusion_dataset = tf.data.Dataset.from_tensor_slices((test_concat_layer, y_test_categorical)).batch(BATCH_SIZE)

# <a id='toc5_'></a>[Model Fusion definition](#toc0_)

In [13]:
from tensorflow.keras import layers

# Input/Output dimensions 
INPUT_FUSION_SIZE = train_concat_layer.shape[1]
NB_OF_OUTPUT_CLASSES = 27

model = tf.keras.Sequential()
model.add(layers.InputLayer(input_shape=(INPUT_FUSION_SIZE)))
model.add(layers.Dense(units=512, activation='relu'))
model.add(layers.Dropout(rate=0.2, name="Dropout"))

model.add(layers.Dense(units=128, activation='relu'))

outputs = layers.Dense(NB_OF_OUTPUT_CLASSES, 
                       activation='softmax',
                       name="Output")

In [17]:
import datetime

from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, CSVLogger, TensorBoard
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.train import latest_checkpoint


# Checkpoint directory and paths
CHECKPOINT_DIR = os.path.join(
    "data", "models", "fusion_text_image_keras")
CHECKPOINT_PATH = os.path.join(
    CHECKPOINT_DIR, "cp_{val_loss:.2f}-{val_accuracy:.2f}-.ckpt")

# Path to the history CSV file to store training metrics
HIST_CSV_PATH = os.path.join(CHECKPOINT_DIR, "history.csv")

# Define where to store training logs
LOG_DIR = os.path.join(CHECKPOINT_DIR, "logs", "fit")
LOG_DATA = os.path.join(
    LOG_DIR, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

# Training
model.build((None, INPUT_FUSION_SIZE))
model.compile(
    optimizer=SGD(learning_rate=0.005, momentum=0.9),
    loss=CategoricalCrossentropy(from_logits=True, label_smoothing=0.1),
    metrics=['accuracy'])

latest = latest_checkpoint(CHECKPOINT_DIR)
if (latest is not None):
    print("Loading checkpoint", latest)
    model.load_weights(latest)
else:
    print("No checkpoint to load")

# Callbacks called between each epoch
cp_callbacks = [
    # Stop the training when there is no improvement in val_accuracy for x epochs
    EarlyStopping(monitor='val_accuracy', patience=10),
    # Save a checkpoint
    ModelCheckpoint(CHECKPOINT_PATH,
                    save_best_only=True,
                    mode="max",
                    monitor="val_accuracy",
                    save_weights_only=True,
                    verbose=1),
    # Insert the metrics into a CSV file
    CSVLogger(HIST_CSV_PATH, separator=',', append=True),
    # Log information to display them in TensorBoard
    TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
]

model.fit(
    train_fusion_dataset,
    epochs=100,
    validation_data=test_fusion_dataset,
    #callbacks=cp_callbacks)
)

No checkpoint to load
Epoch 1/100


ValueError: in user code:

    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "d:\Agnoli\Datascientest\Projet\Fev23_BDS_Rakuten\.conda\lib\site-packages\keras\backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 27) and (None, 128) are incompatible
