## Setup : chargement des données et des modèles

In [8]:
import os


# Number of elements per batch when batching a dataset
BATCH_SIZE = 256

# Location of the pickled datasets (X_train.pkl, X_test.pkl, y_test.pkl)
DATA_DIR = os.path.join("data", "pickle_img_datasets")

# Location of the image files
IMAGES_DIR = os.path.join("data", "images", "image_train")

In [2]:
from tensorflow import keras
import tensorflow as tf
import os
import pickle
from src.data import data


def _load_pickle(filename: str):
    """
    Load a pickled object stored in DATA_DIR.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read.

    Argument:
    - filename: name of the pickle file inside DATA_DIR.

    Return:
    - The unpickled object.
    """
    # WARNING: unpickling can execute arbitrary code. Only load trusted files.
    with open(os.path.join(DATA_DIR, filename), "rb") as pickle_file:
        return pickle.load(pickle_file)


# Load data (missing values in the feature frames are replaced by empty strings)
X_train = _load_pickle("X_train.pkl").fillna("")
X_test = _load_pickle("X_test.pkl").fillna("")
y_test = _load_pickle("y_test.pkl")

# Extract the features to be ready for preprocessing
X_train_features = X_train['designation'] + " " + X_train['description']
X_test_features = X_test['designation'] + " " + X_test['description']

# Store the file path to images in variables
X_train_images = data.get_imgs_filenames(
    X_train["productid"], X_train["imageid"], IMAGES_DIR)
X_test_images = data.get_imgs_filenames(
    X_test["productid"], X_test["imageid"], IMAGES_DIR)

# Define DataFrame names for preprocessing
X_train_features.name = "X_train"
X_test_features.name = "X_test"

# Load text model
text_model = keras.models.load_model(
    os.path.join("data", "models", "mlp_model_v2.h5"))

# TODO Check why there is warning regarding the optimizer state
# NOTE(review): the models are only used for `predict` in this notebook;
# loading with `compile=False` may avoid that warning - confirm before changing.
# Load image model
image_model = keras.models.load_model(
    os.path.join("data", "models", "cnn_mobilenetv2_keras", "cnn_mobilenetv2.h5"))

2023-04-24 11:11:56.658548: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-24 11:11:56.679728: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-24 11:11:57.787066: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-04-24 11:11:57.799378: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to 



### Fonctions à déplacer dans un module une fois le notebook terminé.

In [3]:
import numpy as np
from sklearn.metrics import accuracy_score


# TODO Move the functions below into a library
def convert_sparse_matrix_to_sparse_tensor(X):
    """
    Convert a sparse matrix to a sparse tensor.

    Argument:
    - X: sparse matrix exposing `tocoo()` (e.g. a SciPy sparse matrix).

    Return:
    - `tf.SparseTensor` holding the same values, with its indices reordered
      by `tf.sparse.reorder`.
    """
    coo = X.tocoo()
    # One (row, col) pair per stored value. `np.stack` replaces the deprecated
    # `np.mat(...).transpose()` (`np.mat` was removed in NumPy 2.0) and yields
    # the same (nnz, 2) layout as a plain ndarray.
    indices = np.stack([coo.row, coo.col], axis=1)
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))


# Mapping from the original prdtypecode to its simplified class index (0 to 26).
PRDTYPECODE_DIC = {10: 0, 40: 1, 50: 2, 60: 3, 1140: 4, 1160: 5, 1180: 6, 1280: 7, 1281: 8, 1300: 9, 1301: 10, 1302: 11, 1320: 12, 1560: 13,
                   1920: 14, 1940: 15, 2060: 16, 2220: 17, 2280: 18, 2403: 19, 2462: 20, 2522: 21, 2582: 22, 2583: 23, 2585: 24, 2705: 25, 2905: 26}

# Reverse mapping (simplified class index -> original prdtypecode), built once
# so that converting back is a dictionary lookup instead of a list search.
_SIMPLIFIED_TO_PRDTYPECODE = {
    simplified: prdtypecode for prdtypecode, simplified in PRDTYPECODE_DIC.items()}


def to_simplified_prdtypecode(y: np.ndarray):
    """
    Convert the prdtypecode into a simplified equivalent ranging from 0 to 26.

    Argument:
    - y: iterable of original prdtypecodes.

    Return:
    - Array of simplified codes, in the same order as `y`.

    Raise:
    - KeyError if a value of `y` is not a known prdtypecode.
    """
    return np.array([PRDTYPECODE_DIC[i] for i in y])


def to_normal_prdtypecode(y: np.ndarray):
    """
    Convert back a simplified prdtypecode (ranging from 0 to 26) to the original prdtypecode.

    Argument:
    - y: iterable of simplified codes.

    Return:
    - Array of original prdtypecodes, in the same order as `y`.

    Raise:
    - ValueError if a value of `y` is not a known simplified code.
    """
    try:
        return np.array([_SIMPLIFIED_TO_PRDTYPECODE[i] for i in y])
    except KeyError as error:
        # Keep raising ValueError, as the previous list-based lookup did.
        raise ValueError(
            f"{error.args[0]!r} is not a valid simplified prdtypecode") from error


# Decodes model outputs into original prdtypecodes. It is used for the accuracy
# scores in this notebook and is kept for the confusion matrix of the last model.
def get_normal_text_model_prediction(y_pred):
    """
    Get the original prdtypecode for each row of a model prediction.

    Argument:
    - y_pred: iterable of per-sample score vectors.

    Return:
    - Array with, for each sample, the prdtypecode matching the position of
      its highest score.
    """
    predicted_classes = [np.argmax(scores) for scores in y_pred]
    return np.array(to_normal_prdtypecode(predicted_classes))

# Preprocessing du texte

In [4]:
from src.data.text_preproc_pipeline import TextPreprocess
from src.data.vectorization_pipeline import TfidfStemming


# TODO See with Heiko if this is the right preprocessing
# Build the text preprocessing pipeline and fit it on the training features
tfidf_vectorizer = TfidfStemming()
text_preprocessor = TextPreprocess(tfidf_vectorizer)
text_preprocessor.fit(X_train_features)

# Apply the fitted pipeline to the test features
X_test_preproc = text_preprocessor.transform(X_test_features)

INFO:textpipeline:class:TfidfStemming
INFO:textpipeline:TextPreprocess.fit X_train 40.16 seconds
INFO:textpipeline:TextPreprocess.transform X_test 9.84 seconds


# Prédictions du modèle texte

In [5]:
# The text model is fed the preprocessed test features as a sparse tensor
X_test_sparse_tensor = convert_sparse_matrix_to_sparse_tensor(X_test_preproc)
y_pred_text = text_model.predict(X_test_sparse_tensor)

# Decode the predictions into prdtypecodes and display the accuracy score
y_pred_text_codes = get_normal_text_model_prediction(y_pred_text)
print("Text model accuracy score:", accuracy_score(y_test, y_pred_text_codes))

2023-04-24 11:12:49.809058: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype variant and shape [16984,3]
	 [[{{node Placeholder/_0}}]]


Text model accuracy score: 0.8133537447008949


# Preprocessing des images

In [9]:
# TODO Move the function below into a module
def open_resize_img(filename: str, target_size=(224, 224)) -> tf.Tensor:
    """
    Open image using the filename and return a resized version of it ready for the image model.

    Arguments:
    - filename: complete path to image file including the extension.
    - target_size: (height, width) of the returned image. Defaults to
      (224, 224), the size previously hard-coded here.

    Return:
    - Image matrix in a tensor, decoded with 3 channels and resized to
      `target_size`.
    """
    img = tf.io.read_file(filename)
    img = tf.io.decode_jpeg(img, channels=3)
    # NOTE(review): no pixel scaling/normalization is applied here - confirm
    # this matches the preprocessing used when the image model was trained.
    return tf.image.resize(img, target_size)


# Dataset of test images: one resized image per file path, grouped in batches
test_images_dataset = (
    tf.data.Dataset.from_tensor_slices(X_test_images)
    .map(open_resize_img)
    .batch(BATCH_SIZE)
)

# Prédictions du modèle image

In [11]:
from sklearn.metrics import accuracy_score


# Run the image model on the batched test images
y_pred_image = image_model.predict(test_images_dataset)

# Decode the predictions into prdtypecodes and display the accuracy score
# NOTE(review): the recorded score (~0.035) is close to the 1/27 chance level
# for 27 classes - verify the image preprocessing and the class index to
# prdtypecode mapping expected by the image model.
y_pred_image_codes = get_normal_text_model_prediction(y_pred_image)
print("Image model accuracy score:", accuracy_score(y_test, y_pred_image_codes))

Image model accuracy score: 0.03467969853980217
