- Difficultés à gérer
    - [ ] upskilling timm
    - [ ] Définir les paramètres de base du transform
    - [ ] Encdor les labels proprement

In [1]:
import os
import warnings
from time import time
import requests

import pandas as pd
import numpy as np
from PIL import Image

import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms
from timm.data.transforms_factory import create_transform

from transformers import AutoModelForImageClassification, AutoImageProcessor, AutoModel

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from constants import ROOT_FOLDER, SEED, VAL_SIZE, TEST_SIZE, BATCH_SIZE, SAMPLING


In [2]:
# Désactiver les alertes de FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning)

# Print the torch? cuda and cudnn version
print("Torch version: ", torch.__version__)
print("Cuda version: ", torch.version.cuda)
print("CUDNN version: ", torch.backends.cudnn.version())

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Torch version:  2.6.0+cu124
Cuda version:  12.4
CUDNN version:  90100
Using device: cuda


[COPY] Récupération des paramètres de train-test-split sur le jeu utilisé

In [3]:
# Assign the folder path containing the images
IMAGE_FOLDER = ROOT_FOLDER / "data" / "images"
IMAGE_TEST = ROOT_FOLDER / "data" / "images" / "2aaa6083689193df5ab01fe37dea1b5e.jpg"
# Assign the folder path containing the former H5 efficientnet weights
ARTIFACTS_FOLDER = ROOT_FOLDER / "artifacts"
# Assign the folder path with the pickle dataset with labels, images filenames and metadata
DATASET_PATH = ROOT_FOLDER / "data" / "dataset_cleaned.pkl"

In [4]:
# Loading the pickle dataset_cleaned used with the previous project as a pandas df
df = pd.read_pickle(DATASET_PATH).drop(columns=['product_name', 'description'])
print(f"Dataset shape: {df.shape}")
print(f"Dataset columns: {df.columns}")

# Encode the labels with LabelEncoder
le = LabelEncoder()
le.fit(df["class"])
n_classes = len(le.classes_)
classes = le.classes_.tolist()
print(f"Number of classes: {n_classes}")
print(f"Classes: {classes}")

# Finally transform the class column to the encoded labels
df["class"] = le.transform(df["class"])

Dataset shape: (1050, 2)
Dataset columns: Index(['image', 'class'], dtype='object')
Number of classes: 7
Classes: ['Baby Care', 'Beauty and Personal Care', 'Computers', 'Home Decor & Festive Needs', 'Home Furnishing', 'Kitchen & Dining', 'Watches']


In [5]:
# Splitting the datasets into train, val and test sets
X_temp, X_test, y_temp, y_test = train_test_split(df['image'], df['class'], test_size=TEST_SIZE, random_state=SEED, stratify=df['class'], shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=VAL_SIZE, random_state=SEED, stratify=y_temp, shuffle=True)

# Concat X and y for each set
train = pd.concat([X_train, y_train], axis=1).sample(SAMPLING) if SAMPLING else pd.concat([X_train, y_train], axis=1)
val = pd.concat([X_val, y_val], axis=1).sample(SAMPLING) if SAMPLING else pd.concat([X_val, y_val], axis=1)
test = pd.concat([X_test, y_test], axis=1).sample(SAMPLING) if SAMPLING else pd.concat([X_test, y_test], axis=1)

# Print the shape of each set
print(f"Train shape: {train.shape}")
print(f"Val shape: {val.shape}")
print(f"Test shape: {test.shape}")

Train shape: (32, 2)
Val shape: (32, 2)
Test shape: (32, 2)


In [6]:
train.head(5)

Unnamed: 0,image,class
708,ce9207944cedeaa82e4ea6269586af2a.jpg,4
344,6e44d107ee32412243b19b0ed9b415f3.jpg,1
307,a12d9ae5720ae41446e084911f0c2865.jpg,3
234,ed139e0d1b5c973495e1aa35dd4a5533.jpg,5
461,3b80ac036843b278083fabfd9a3c84ff.jpg,5


In [7]:
val.head(5)

Unnamed: 0,image,class
388,98ad5b99ad96695568d8f143b11ab740.jpg,1
7,dd0e3470a7e6ed76fd69c2da27721041.jpg,6
282,a8ea6fc2b3cd95f46bced80853ce8e0e.jpg,0
622,672d1c3272eae4586eb5994fe408c12a.jpg,0
447,8e961e4fd248c9496ca54808d2d2b25f.jpg,2


In [8]:
test.head(5)

Unnamed: 0,image,class
864,19d2dbc1789653c58bce08c169662cf2.jpg,2
861,ad08a6efe82576ab162a9336feff647f.jpg,2
846,82466269245f199d7f850509307a497b.jpg,2
74,7a3f11f380a1bf85338ab3771ff81e9f.jpg,0
899,f0ab5f12bd777e28a401a728fcb93a4b.jpg,3


In [9]:
# Block DataLoader
class ImageDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor=None, transform=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing image file names and labels.
            image_dir (str): Directory where images are stored.
            processor (AutoImageProcessor, optional):  Hugging Face processor for image preprocessing. Defaults to None.
            transform (callable, optional): Optional transform to be applied on a sample. Defaults to None.
        """
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx, 0])  # Assuming image file names are in the first column
        image = Image.open(img_name).convert('RGB')  # Ensure consistent color format

        label = self.dataframe.iloc[idx, 1]  # Assuming labels are in the second column

        if self.processor:
            inputs = self.processor(images=image, return_tensors="pt")
            image = inputs['pixel_values'].squeeze()  # Remove batch dimension
        elif self.transform:
            image = self.transform(image)

        # Convert label to tensor
        label = torch.tensor(label)

        return image, label

# **EXPERIMENTS**

## **Google VIT**

In [10]:
# Assigne the model card name
model_card = "google/vit-base-patch16-224-in21k"

# Define the model
model = AutoModelForImageClassification.from_pretrained(model_card, num_labels=n_classes, trust_remote_code=True)

# Define the image processor
processor = AutoImageProcessor.from_pretrained(model_card)

# Prepare the dataloaders for training, validation and testing
dataset = ImageDataset(dataframe=train, image_dir=IMAGE_FOLDER, processor=processor)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
dataset = ImageDataset(dataframe=val, image_dir=IMAGE_FOLDER, processor=processor)
val_loader = DataLoader(dataset, batch_size=BATCH_SIZE)
dataset = ImageDataset(dataframe=test, image_dir=IMAGE_FOLDER, processor=processor)
test_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [11]:
best_val_metric = float('-inf')
best_epoch = 0

# Move model to the device
model.to(device)
# Set the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set the number of epochs
num_epochs = 3

# Set the loss function
criterion = torch.nn.CrossEntropyLoss()

# Training loop
for epoch in range(num_epochs):
    # --- 1. TRAINING LOOP ---
    running_loss = 0.0
    model.train()
    for inputs, labels in train_loader:
        # Move images and labels to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(train_loader):.4f}")

    # --- 2. VALIDATION LOOP ---
    model.eval()
    val_loss, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            # Move images and labels to the device
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs).logits
            val_loss   += criterion(outputs, labels).item() * inputs.size(0)
            preds      = outputs.argmax(dim=1)
            correct   += (preds == labels).sum().item()
            total     += inputs.size(0)

    avg_val_loss = val_loss / total
    val_acc      = correct / total
    running_loss = 0.0
    print(f"Epoch [{epoch + 1}/{num_epochs}], Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.4f}")
    
    # --- 3. UPDATE BEST MODEL ---
    if val_acc > best_val_metric:
        best_val_metric = val_acc
        best_epoch      = epoch
        torch.save(model.state_dict(), ARTIFACTS_FOLDER / f'best_{model_card.split("/")[-1].replace("-", "_")}.pth')
    print(f"Best model updated at epoch {best_epoch + 1} with val acc: {best_val_metric:.4f}")

Epoch [1/3], Loss: 1.9417
Epoch [1/3], Val Loss: 1.9310, Val Acc: 0.1250
Best model updated at epoch 1 with val acc: 0.1250
Epoch [2/3], Loss: 1.8607
Epoch [2/3], Val Loss: 1.9246, Val Acc: 0.1562
Best model updated at epoch 2 with val acc: 0.1562
Epoch [3/3], Loss: 1.7995
Epoch [3/3], Val Loss: 1.9185, Val Acc: 0.1875
Best model updated at epoch 3 with val acc: 0.1875


In [16]:
# Load the best model
model.load_state_dict(torch.load(ARTIFACTS_FOLDER / f'best_{model_card.split("/")[-1].replace("-", "_")}.pth'))

# --- 4. TESTING LOOP ---
model.eval()
test_loss, correct, total = 0.0, 0, 0
y_pred = []
y_true = []

# Compute the inference time
start_time = time()
with torch.no_grad():
    for inputs, labels in test_loader:
        # Move images and labels to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs).logits
        test_loss   += criterion(outputs, labels).item() * inputs.size(0)
        preds      = outputs.argmax(dim=1)
        y_pred.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())
        correct   += (preds == labels).sum().item()
        total     += inputs.size(0)
end_time = time()
# Compute the inference time
inference_time = end_time - start_time
print(f"Inference time: {inference_time:.4f} seconds")
# Compute the average test loss and accuracy
avg_test_loss = test_loss / total
test_acc      = correct / total
print(f"Test Loss: {avg_test_loss:.4f}, Test Acc: {test_acc:.4f}")

Inference time: 1.5189 seconds
Test Loss: 1.9254, Test Acc: 0.1875


In [17]:
# Compute the confusion matrix
print(f"Confusion matrix: {pd.crosstab(pd.Series(y_true), pd.Series(y_pred), rownames=['True'], colnames=['Predicted'], margins=True)}")

Confusion matrix: Predicted  0  1  2  3  4  5  6  All
True                               
0          2  2  1  1  0  0  1    7
1          0  1  0  0  0  1  0    2
2          0  0  2  2  0  0  1    5
3          3  0  1  0  1  0  1    6
4          1  2  0  0  0  2  0    5
5          0  1  0  0  0  1  1    3
6          2  2  0  0  0  0  0    4
All        8  8  4  3  1  4  4   32


In [18]:
print(classification_report(y_true, y_pred, target_names=classes, zero_division=0))

                            precision    recall  f1-score   support

                 Baby Care       0.25      0.29      0.27         7
  Beauty and Personal Care       0.12      0.50      0.20         2
                 Computers       0.50      0.40      0.44         5
Home Decor & Festive Needs       0.00      0.00      0.00         6
           Home Furnishing       0.00      0.00      0.00         5
          Kitchen & Dining       0.25      0.33      0.29         3
                   Watches       0.00      0.00      0.00         4

                  accuracy                           0.19        32
                 macro avg       0.16      0.22      0.17        32
              weighted avg       0.16      0.19      0.17        32



## MOBILENETV2

In [None]:
"google/mobilenet_v2_1.0_224"

## **MAMBA S 1 K**

In [17]:
model = AutoModel.from_pretrained("nvidia/MambaVision-S-1K", trust_remote_code=True)

# eval mode for inference
model.cuda().eval()

# prepare image for the model
url = 'http://images.cocodataset.org/val2017/000000020247.jpg'
image = Image.open(requests.get(url, stream=True).raw)
input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=False,
                             mean=model.config.mean,
                             std=model.config.std,
                             crop_mode=model.config.crop_mode,
                             crop_pct=model.config.crop_pct)
inputs = transform(image).unsqueeze(0).cuda()
# model inference
out_avg_pool, features = model(inputs)
print("Size of the averaged pool features:", out_avg_pool.size())  # torch.Size([1, 768])
print("Number of stages in extracted features:", len(features)) # 4 stages
print("Size of extracted features in stage 1:", features[0].size()) # torch.Size([1, 96, 56, 56])
print("Size of extracted features in stage 4:", features[3].size()) # torch.Size([1, 768, 7, 7])

Size of the averaged pool features: torch.Size([1, 768])
Number of stages in extracted features: 4
Size of extracted features in stage 1: torch.Size([1, 96, 56, 56])
Size of extracted features in stage 4: torch.Size([1, 768, 7, 7])


In [18]:
num_classes = 7

class MambaClassifier(nn.Module):
    def __init__(self, backbone, num_classes, hidden_dim=768): # Added hidden_dim
        super().__init__()
        self.backbone = backbone
        self.config = self.backbone.config
        # self.classifier = nn.Linear(640, num_classes) # Original classifier
        self.classifier = nn.Linear(hidden_dim, num_classes) # Modified classifier

    def forward(self, x):
        out_avg_pool, _ = self.backbone(x)
        logits = self.classifier(out_avg_pool)
        return logits


In [19]:
mamba = MambaClassifier(model, num_classes=num_classes).cuda().eval()

image = Image.open(img_test)

input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=False,
                             mean=mamba.config.mean,
                             std=mamba.config.std,
                             crop_mode=mamba.config.crop_mode,
                             crop_pct=mamba.config.crop_pct)


inputs = transform(image).unsqueeze(0).cuda()
# Prédiction
with torch.no_grad():
    logits = mamba(inputs)
    probs = torch.softmax(logits, dim=-1)

print("Probabilités prédites:", probs)

Probabilités prédites: tensor([[0.1360, 0.1641, 0.1267, 0.1375, 0.1245, 0.1609, 0.1504]],
       device='cuda:0')


In [20]:
model = MambaClassifier(AutoModel.from_pretrained("nvidia/MambaVision-S-1K", trust_remote_code=True), num_classes=num_classes).cuda().train()

image = Image.open(img_test)

input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=True,
                             mean=mamba.config.mean,
                             std=mamba.config.std,
                             crop_mode=mamba.config.crop_mode,
                             crop_pct=mamba.config.crop_pct)

dataset = ImageDataset(dataframe=train, image_dir=image_dir, transform=transform)

# 5. Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [21]:
# Training loop for the model with the train dataloader
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device
model.to(device)
# Set the model to training mode
model.train()
# Set the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set the number of epochs
num_epochs = 3
# Set the loss function
criterion = torch.nn.CrossEntropyLoss()
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in dataloader:
        # Move images and labels to the device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        
        # inputs = transform(image).unsqueeze(0).cuda()
        # model inference
        # outputs = model(inputs)
        # end_time = time()
        # logits = outputs["logits"]
        # predicted_class_idx = logits.argmax(-1).item()

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")
# Save the model
# model.save_pretrained("mamba_model")
# processor.save_pretrained("mamba_processor")

Epoch [1/3], Loss: 1.9875
Epoch [2/3], Loss: 1.9445
Epoch [3/3], Loss: 1.9724


In [22]:
# Inference on test set
# Set the model to evaluation mode
model.eval()
# Initialize the test dataset
test_dataset = ImageDataset(dataframe=test, image_dir=image_dir, transform=transform)
# Initialize the test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)
# Initialize the predictions list
predictions = []
# Initialize the labels list
labels_list = []
# Inference loop
for images, labels in test_dataloader:
    # Move images to the device
    images = images.to(device)
    # Forward pass
    with torch.no_grad():
        outputs = model(images)
    # Get the predicted labels
    _, preds = torch.max(outputs, 1)
    # Append the predictions and labels to the lists
    predictions.extend(preds.cpu().numpy())
    labels_list.extend(labels.cpu().numpy())
# Convert the predictions and labels to numpy arrays
predictions = np.array(predictions)
labels_list = np.array(labels_list)
# Print the classification report
print(classification_report(labels_list, predictions, target_names=classes))
# Save the predictions

                            precision    recall  f1-score   support

           Home Furnishing       0.00      0.00      0.00         4
                 Baby Care       0.00      0.00      0.00         5
                   Watches       0.00      0.00      0.00         4
Home Decor & Festive Needs       0.00      0.00      0.00         6
          Kitchen & Dining       0.40      0.40      0.40         5
  Beauty and Personal Care       0.10      0.20      0.13         5
                 Computers       0.00      0.00      0.00         3

                  accuracy                           0.09        32
                 macro avg       0.07      0.09      0.08        32
              weighted avg       0.08      0.09      0.08        32



## **MAMBA B 21 K**

In [23]:
model = MambaClassifier(AutoModel.from_pretrained("nvidia/MambaVision-B-21K", trust_remote_code=True), num_classes=num_classes, hidden_dim=1024).cuda().train()

image = Image.open(img_test)

input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=True,
                             mean=mamba.config.mean,
                             std=mamba.config.std,
                             crop_mode=mamba.config.crop_mode,
                             crop_pct=mamba.config.crop_pct)

dataset = ImageDataset(dataframe=train, image_dir=image_dir, transform=transform)

# 5. Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [24]:
# Training loop for the model with the train dataloader
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device
model.to(device)
# Set the model to training mode
model.train()
# Set the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set the number of epochs
num_epochs = 3
# Set the loss function
criterion = torch.nn.CrossEntropyLoss()
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in dataloader:
        # Move images and labels to the device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        
        # inputs = transform(image).unsqueeze(0).cuda()
        # model inference
        # outputs = model(inputs)
        # end_time = time()
        # logits = outputs["logits"]
        # predicted_class_idx = logits.argmax(-1).item()

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")
# Save the model
# model.save_pretrained("mamba_model")
# processor.save_pretrained("mamba_processor")

Epoch [1/3], Loss: 2.1026
Epoch [2/3], Loss: 2.0404
Epoch [3/3], Loss: 1.9177


In [25]:
# Inference on test set
# Set the model to evaluation mode
model.eval()
# Initialize the test dataset
test_dataset = ImageDataset(dataframe=test, image_dir=image_dir, transform=transform)
# Initialize the test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)
# Initialize the predictions list
predictions = []
# Initialize the labels list
labels_list = []
# Inference loop
for images, labels in test_dataloader:
    # Move images to the device
    images = images.to(device)
    # Forward pass
    with torch.no_grad():
        outputs = model(images)
    # Get the predicted labels
    _, preds = torch.max(outputs, 1)
    # Append the predictions and labels to the lists
    predictions.extend(preds.cpu().numpy())
    labels_list.extend(labels.cpu().numpy())
# Convert the predictions and labels to numpy arrays
predictions = np.array(predictions)
labels_list = np.array(labels_list)
# Print the classification report
print(classification_report(labels_list, predictions, target_names=classes))
# Save the predictions

                            precision    recall  f1-score   support

           Home Furnishing       0.00      0.00      0.00         4
                 Baby Care       0.50      0.40      0.44         5
                   Watches       0.17      0.25      0.20         4
Home Decor & Festive Needs       0.20      0.17      0.18         6
          Kitchen & Dining       0.25      0.20      0.22         5
  Beauty and Personal Care       0.25      0.20      0.22         5
                 Computers       0.00      0.00      0.00         3

                  accuracy                           0.19        32
                 macro avg       0.20      0.17      0.18        32
              weighted avg       0.21      0.19      0.20        32

