In [31]:
# TODO: Continuer à travailler la classe pour l'évaluation sur le jeu de test et finir le pipeline
# TODO: Intégrer les métriques du jeu de test

# README

#TODO Ce notebook présente une analyse de données sur les ventes de jeux vidéo. L'objectif est d'explorer les tendances des ventes en fonction de la plateforme, du genre et de la région.

# PRE-REQUIS

Ce bloc contient tout ce qui est nécessaire pour le fonctionnement des expériences.

## Imports & Configurations

In [32]:
import os
import warnings
from time import time
from datetime import datetime
from pathlib import PosixPath
import re
import io
import subprocess

import pandas as pd
import numpy as np
from PIL import Image

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay

from constants import ROOT_FOLDER, IMAGE_FOLDER, ARTIFACTS_FOLDER, DATASET_PATH
from constants import SEED, VAL_SIZE, TEST_SIZE, BATCH_SIZE, SAMPLING, INPUT_RESOLUTION

In [33]:
# Gestion des avertissements
warnings.filterwarnings("ignore", category=FutureWarning)

In [34]:
# Configuration de cuda avec PyTorch
print("Tensorflow version: ", tf.__version__)
cuda_version = subprocess.check_output(["nvidia-smi", "--version"]).decode().strip().split(': ')[-1]
print("Cuda version: ", cuda_version)
cudnn_version = subprocess.check_output(["grep", "-oPm 1", "nvidia_cudnn_cu12-\K[0-9.]+(?=-py3)", "uv.lock"]).decode().strip()
print("CUDNN version: ", cudnn_version)
print()
print("Tensorflow using GPU: ", tf.config.list_physical_devices('GPU')[0])

Tensorflow version:  2.19.0
Cuda version:  12.8
CUDNN version:  9.3.0.75

Tensorflow using GPU:  PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Classes et Fonctions

In [35]:
# Block DataLoader
class ImageDataset(tf.keras.utils.Sequence):
    def __init__(
        self,
        dataframe: pd.DataFrame,
        image_dir: PosixPath,
        processor=None,
        **kwargs,
    ):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing image file names and labels.
            image_dir (PosixPath): Directory where images are stored.
            processor (AutoImageProcessor, optional): Hugging Face processor for image preprocessing. Defaults to None.
        """
        super().__init__(**kwargs)
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor

    def __len__(self):
        return int(np.ceil(len(self.dataframe) / BATCH_SIZE))

    def __getitem__(self, idx):
        # Check if the index is valid
        if idx >= len(self):
            raise IndexError(
                f"Index {idx} out of range for dataset of length {len(self)}"
            )
        # Get the batch of data
        batch_data = self.dataframe.iloc[idx * BATCH_SIZE : (idx + 1) * BATCH_SIZE, :]
        images = []
        labels = []

        for _, row in batch_data.iterrows():
            name, label = row
            img_name = (
                self.image_dir / name
            )  # Assuming image file names are in the first column
            image = Image.open(img_name).convert(
                "RGB"
            )  # Ensure consistent color format
            image = image.resize(
                (INPUT_RESOLUTION[0], INPUT_RESOLUTION[1]), Image.Resampling.BILINEAR
            )  # Resize to model input size

            if self.processor:
                image = self.processor(image)

            images.append(np.array(image))
            labels.append(label)

        # Convert to TensorFlow tensors
        images = tf.convert_to_tensor(images, dtype=tf.float32)
        labels = tf.convert_to_tensor(labels, dtype=tf.int32)
        labels = tf.one_hot(labels, depth=N_CLASSES)

        return images, labels

In [36]:
def split_dataset(df, train_path, val_path, test_path):
    # Splitting the datasets into train, val and test sets
    X_temp, X_test, y_temp, y_test = train_test_split(
        df["image"],
        df["class"],
        test_size=TEST_SIZE,
        random_state=SEED,
        stratify=df["class"],
        shuffle=True,
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=VAL_SIZE,
        random_state=SEED,
        stratify=y_temp,
        shuffle=True,
    )

    # Concat X and y for each set
    train = (
        pd.concat([X_train, y_train], axis=1).sample(SAMPLING)
        if SAMPLING
        else pd.concat([X_train, y_train], axis=1)
    )
    train.to_pickle(train_path)
    val = (
        pd.concat([X_val, y_val], axis=1).sample(SAMPLING)
        if SAMPLING
        else pd.concat([X_val, y_val], axis=1)
    )
    val.to_pickle(val_path)
    test = (
        pd.concat([X_test, y_test], axis=1).sample(SAMPLING)
        if SAMPLING
        else pd.concat([X_test, y_test], axis=1)
    )
    test.to_pickle(test_path)


def load_splits(train_path, val_path, test_path):
    # Load the saved files if they exist
    try:
        train = pd.read_pickle(train_path)
        val = pd.read_pickle(val_path)
        test = pd.read_pickle(test_path)
    except FileNotFoundError as e:
        print(e)
        print("This file has not been found. Please check the paths before.")

    # Finally print the shapes of the datasets
    print(f"Train shape: {train.shape}")
    print(f"Val shape: {val.shape}")
    print(f"Test shape: {test.shape}")

    return train, val, test

In [37]:
# Prewarming the model
def warming_up(model, dataset):
    """
    Warming up the model by running a batch through it.
    """
    for x_batch in dataset:
        inputs, labels = x_batch
        _ = model.predict(inputs)
        print("Warming up the model...")
        break


def plot_to_image(fig):
    """Convert a matplotlib figure to a PNG image with a batch dim."""
    buf = io.BytesIO()
    fig.savefig(buf, format="png")  # Sauvegarde la figure dans un buffer
    plt.close(fig)  # Libère la mémoire
    buf.seek(0)  # Repositionne le curseur au début
    image = tf.image.decode_png(buf.getvalue(), channels=4)  # Décodage en tensor RVBA
    image = tf.expand_dims(image, 0)  # Ajoute une dimension pour le batch
    return image


def split_labels_on_and_or_ampersand(labels):
    """
    Insert a newline after each '&' ou 'and' in label names.
    """
    return [re.sub(r"\s*(and|&)\s*", r"\n\1 ", label) for label in labels]


def generate_experiment_id(model_card, freeze_backbone):
    """
    Generate a unique experiment ID based on the current date and time.
    """
    freeze_str = "freezed" if freeze_backbone else "unfreezed"
    return "_".join(
        [
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            model_card.split("/")[-1],
            freeze_str,
        ]
    )

## Préparation des données

### Chargement du dataset

In [38]:
# Loading the pickle dataset_cleaned used with the previous project as a pandas df
df = pd.read_pickle(DATASET_PATH).drop(columns=["product_name", "description"])
print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns}")

# Encode the labels with LabelEncoder
le = LabelEncoder()
le.fit(df["class"])
N_CLASSES = len(le.classes_)
CLASSES = le.classes_.tolist()
print(f"Number of classes: {N_CLASSES}")
print(f"Classes: {CLASSES}")

# Finally transform the class column to the encoded labels
df["class"] = le.transform(df["class"])

Dataset shape: (1050, 2)
Dataset columns: Index(['image', 'class'], dtype='object')
Number of classes: 7
Classes: ['Baby Care', 'Beauty and Personal Care', 'Computers', 'Home Decor & Festive Needs', 'Home Furnishing', 'Kitchen & Dining', 'Watches']


### Séparation des données (train/validation/test)

In [39]:
# Define the path to save the splitted cleaned datasets
completion = SAMPLING if SAMPLING else "full"
train_path = ROOT_FOLDER / "data" / f"trainset_{completion}.pickle"
val_path = ROOT_FOLDER / "data" / f"valset_{completion}.pickle"
test_path = ROOT_FOLDER / "data" / f"testset_{completion}.pickle"

# Load the splitted datasets if they exist
if (
    os.path.exists(train_path)
    and os.path.exists(val_path)
    and os.path.exists(test_path)
):
    train, val, test = load_splits(train_path, val_path, test_path)
else:
    # If the one or more files do not exist, split the dataset and save/overwrite the files
    split_dataset(df, train_path, val_path, test_path)
    train, val, test = load_splits(train_path, val_path, test_path)

Train shape: (758, 2)
Val shape: (134, 2)
Test shape: (158, 2)


In [40]:
train.head(5)

Unnamed: 0,image,class
229,caabe6014b914fe2874a9a8d7284f79b.jpg,3
450,95feec21a9d076cff084159d61bf9b8e.jpg,0
798,9993de7e2bcced43dc9edb3b2c81f23d.jpg,1
230,968a2b3be84193e3f755c2fe71033a2c.jpg,3
293,c2efa8aa11898bdb5fc4e46201973a42.jpg,0


In [41]:
val.head(5)

Unnamed: 0,image,class
979,c44a5dc5b5ebe5b3e0535b7c2b7921e4.jpg,0
49,02a53d335775b652f22f41b529b9d646.jpg,1
567,97fba8a02361aa56eaa9fa51bc1d7661.jpg,6
494,a124d6e4c30b00918c594289266a383c.jpg,6
773,109e235d4838002246599f987d935c21.jpg,0


In [42]:
test.head(5)

Unnamed: 0,image,class
11,08452abdadb3db1e686b94a9c52fc7b6.jpg,6
548,2541b59d54a3a9f2681c0049f7ddd85c.jpg,6
696,82fbc93cd45ab747e7e606f2c52c7335.jpg,3
238,2e8df36b35d22cf219cf8bae6c2af752.jpg,5
963,bcb51cec3d290e6a661586d0df30e091.jpg,4


### DataLoader

In [43]:
train_dataset = ImageDataset(
    train,
    image_dir=IMAGE_FOLDER,
    processor=preprocess_input,
)
val_dataset = ImageDataset(
    val,
    image_dir=IMAGE_FOLDER,
    processor=preprocess_input,
)
test_dataset = ImageDataset(
    test,
    image_dir=IMAGE_FOLDER,
    processor=preprocess_input,
)

# EXPERIMENTS

In [44]:
# Enregistre les paramètres du modèle
model_params = {
    "include_top": False,
    "weights": "imagenet",
    "input_shape": (224, 224, 3),
}

# Charge le modèle EfficientNetB0
model = EfficientNetB0(**model_params)

# Affiche les couches du modèle d'entrée et de sortie
print("Input: ", model.input)
print("Output: ", model.output)

# Ajoute les dernières couches denses du modèle
x = tf.keras.layers.GlobalAveragePooling2D(name="avg_pool")(model.output)
x = tf.keras.layers.Dense(1024, activation="relu")(x)
predictions = tf.keras.layers.Dense(
    N_CLASSES, activation="softmax", name="predictions"
)(x)

# Crée un modèle à partir des inputs et des outputs
model = Model(inputs=model.input, outputs=predictions)

model.compile(optimizer=Adam(), loss="categorical_crossentropy", metrics=["accuracy"])

# Définit le nom du modèle à charger
model_name = ARTIFACTS_FOLDER / "effnet.weights.h5"

# Charger les poids du modèle sauvegardés
model.load_weights(model_name)


Input:  <KerasTensor shape=(None, 224, 224, 3), dtype=float32, sparse=False, ragged=False, name=keras_tensor_482>
Output:  <KerasTensor shape=(None, 7, 7, 1280), dtype=float32, sparse=False, ragged=False, name=keras_tensor_719>


  saveable.load_own_variables(weights_store.get(inner_path))


In [46]:
# Define the experiment ID
model_card = "EfficientNetB0_custom"
experiment_id = generate_experiment_id(model_card, freeze_backbone=False)
log_dir = ROOT_FOLDER / "runs" / experiment_id

# Create the writer for TensorBoard
writer = tf.summary.create_file_writer(str(log_dir))

with writer.as_default():
    # TESTING LOOP
    print(f"TESTING EXPERIMENT ID <{experiment_id}>")
    print("==========================")
    # Warming up the model
    warming_up(model, test_dataset)
    # Iterate over the dataset batch by batch
    batch_times = []
    loss, running_time = 0.0, 0.0
    y_pred = []
    y_true = []
    for step, x_batch in enumerate(test_dataset):
        t0 = time()
        inputs, labels = x_batch
        preds = model.predict_on_batch(inputs)
        t1 = time() - t0
        batch_times.append(t1)
        # Compute the loss with preds & running time
        loss += model.evaluate(inputs, labels, verbose=0)[0]
        running_time += t1
        tf.summary.scalar("TimingByStep/test", running_time, step=step)
        # Add the predictions and labels with the argmax to loose the one-hot encoding
        y_pred.extend(np.argmax(preds, axis=1))
        y_true.extend(np.argmax(labels, axis=1))

    classifier_report = classification_report(
        y_true, y_pred, target_names=CLASSES, zero_division=0, output_dict=True
    )
    formatted_labels = split_labels_on_and_or_ampersand(CLASSES)

    # Print the test metrics
    print(
        f"Test Loss: {loss:.4f} | Test Acc: {classifier_report['accuracy']:.4f} | Running steps test time: {np.sum(batch_times):.2f} s."
    )

    # Create the confusion matrix
    cm = ConfusionMatrixDisplay.from_predictions(
        y_true,
        y_pred,
        labels=range(N_CLASSES),
        normalize=None,
        display_labels=formatted_labels,
        values_format=".2g",
        xticks_rotation="vertical",
        colorbar=False,
        cmap=plt.cm.Blues,
    )
    cm.figure_.tight_layout()
    fig = plot_to_image(cm.figure_)

    # Write the confusion matrix
    tf.summary.image("ConfusionMatrix/test", fig, 0)
    # Save the loss and accuracy
    tf.summary.scalar("Accuracy/test", classifier_report["accuracy"], 0)
    # Save the classification report
    for label, metrics in classifier_report.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                # Add the metric to the scalar
                tf.summary.scalar(f"ClassificationReport/{label}/{metric}", value, 0)
        else:
            tf.summary.scalar(f"ClassificationReport/{label}", metrics, 0)

# Flush the writer
writer.flush()

TESTING EXPERIMENT ID <20250507-143801_EfficientNetB0_custom_unfreezed>
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 203ms/step
Warming up the model...
Test Loss: 13.0194 | Test Acc: 0.7911 | Running steps test time: 1.11 s.
