- Difficultés à gérer
    - [ ] upskilling timm
    - [ ] Définir les paramètres de base du transform
    - [ ] Encdor les labels proprement

## Les versions de pytorch et tensorflow
----------------

Afin de trouver un compromis entre les versions de pytorch et tensorflow, il a été décidé d'utiliser les versions suivantes:
- Pytorch 2.6
- Tensorflow 2.15

La version de `pytorch==2.7` est incompatible avec `mamba_ssm` et soulève l'erreur suivante:
<div style="background-color:rgba(100, 100, 100, 0.1); padding:10px; border-radius:4px;">
<pre><code>
File ~/github/oc_p9/backend/.venv/lib/python3.11/site-packages/mamba_ssm/__init__.py:3
      1 __version__ = "2.2.4"
----> 3 from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
      4 from mamba_ssm.modules.mamba_simple import Mamba
      5 from mamba_ssm.modules.mamba2 import Mamba2

File ~/github/oc_p9/backend/.venv/lib/python3.11/site-packages/mamba_ssm/ops/selective_scan_interface.py:18
     14     causal_conv1d_cuda = None
     16 from mamba_ssm.ops.triton.layer_norm import _layer_norm_fwd
---> 18 import selective_scan_cuda
     21 class SelectiveScanFn(torch.autograd.Function):
     23     @staticmethod
     24     def forward(ctx, u, delta, A, B, C, D=None, z=None, delta_bias=None, delta_softplus=False,
     25                 return_last_state=False):

ImportError: /home/hedredo/github/oc_p9/backend/.venv/lib/python3.11/site-packages/selective_scan_cuda.cpython-311-x86_64-linux-gnu.so: undefined symbol: _ZN3c107WarningC1ESt7variantIJNS0_11UserWarningENS0_18DeprecationWarningEEERKNS_14SourceLocationESsb
</code></pre>
</div>

Enfin la version la plus récente `tensorflow==2.19` est incompatible avec la dépendance `nvidia-cudnn-cu12==9.1` de `torch==2.6`.<br>
Pour cette raison, la version de `tensorflow==2.15` a été choisie en raison de sa compatibilité avec la dépendance.<br>

In [3]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import warnings
from time import time
from transformers import AutoModelForImageClassification, AutoImageProcessor, AutoModel
import requests
from timm.data.transforms_factory import create_transform
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from torchvision import transforms
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt


In [4]:
import numpy as np
# display numpy version
print("Numpy version:", np.__version__)

Numpy version: 2.2.5


In [5]:
# Désactiver les alertes de FutureWarning
warnings.filterwarnings("ignore", category=FutureWarning)

In [7]:
# Print the torch version
print(torch.__version__)
# Print the CUDA version
print(torch.version.cuda)
# Print the cudnn version
print(torch.backends.cudnn.version())

2.6.0+cu124
12.4
90100


In [8]:
img_test = "/home/hedredo/github/oc_p9/data/images/2aaa6083689193df5ab01fe37dea1b5e.jpg"
url = "http://images.cocodataset.org/val2017/000000020247.jpg"
model_name_or_path = "google/vit-base-patch16-224-in21k" # or "nvidia/MambaVision-T-1K"
models = [
    "nvidia/MambaVision-T-1K",
    "nvidia/MambaVision-T2-1K",
    "nvidia/MambaVision-S-1K"
]

[COPY] Récupération des paramètres de train-test-split sur le jeu utilisé

In [9]:
image_df = pd.read_pickle('/home/hedredo/github/oc_p9/data/dataset_cleaned.pkl')
image_df.drop(columns=['product_name', 'description'], inplace=True)
# Assigne le nombre de classes
n_classes = image_df['class'].nunique()

# Assigne la liste des classes
classes = list(image_df['class'].unique())

# Encode les labels
label_encoder = LabelEncoder()
image_df['class'] = label_encoder.fit_transform(image_df['class'])
print(image_df.info())
# train test split avec un jeu de test de 20%
X_train, X_test, y_train, y_test = train_test_split(image_df['image'], image_df['class'], test_size=0.2, random_state=314, stratify=image_df['class'])

# Regroupement des données en dataframe de train et de test
train = pd.concat([X_train, y_train], axis=1).sample(32)
test = pd.concat([X_test, y_test], axis=1).sample(32)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1050 entries, 0 to 1049
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   1050 non-null   object
 1   class   1050 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 16.5+ KB
None


In [10]:
train.head(5)

Unnamed: 0,image,class
922,486e98154514ed485f0b2f9bc9f24549.jpg,3
372,ea82cb68a6e79d3c10fe9c4255c6a508.jpg,1
1003,0c70a580d96e50966130e2885d8c3505.jpg,5
365,7bcabeb38f148041087fafdad40e2c57.jpg,1
211,e99d9abb115a9bd580bcccf9ff4d4881.jpg,3


In [11]:
# image dir
image_dir = '/home/hedredo/github/oc_p9/data/images/'

In [12]:
# Block DataLoader
class ImageDataset(Dataset):
    def __init__(self, dataframe, image_dir, processor=None, transform=None):
        """
        Args:
            dataframe (pd.DataFrame): DataFrame containing image file names and labels.
            image_dir (str): Directory where images are stored.
            processor (AutoImageProcessor, optional):  Hugging Face processor for image preprocessing. Defaults to None.
            transform (callable, optional): Optional transform to be applied on a sample. Defaults to None.
        """
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx, 0])  # Assuming image file names are in the first column
        image = Image.open(img_name).convert('RGB')  # Ensure consistent color format

        label = self.dataframe.iloc[idx, 1]  # Assuming labels are in the second column

        if self.processor:
            inputs = self.processor(images=image, return_tensors="pt")
            image = inputs['pixel_values'].squeeze()  # Remove batch dimension
        elif self.transform:
            image = self.transform(image)

        # Convert label to tensor
        label = torch.tensor(label)

        return image, label

# **EXPERIMENTS**

## **Google VIT**

In [13]:
# Set the classification head with 7 classes
model_name_or_path = "google/vit-base-patch16-224-in21k" # or "nvidia/MambaVision-T-1K"
model = AutoModelForImageClassification.from_pretrained(
        model_name_or_path, num_labels=7, trust_remote_code=True
    )

# Block Transform

# If you don't want to use a processor, you can use transforms
transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

# Block processor 

# 3. Initialize processor/transforms
processor = AutoImageProcessor.from_pretrained(model_name_or_path) # Use if you want to use a processor

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [14]:
dataset = ImageDataset(dataframe=train, image_dir=image_dir, processor=processor)
dataset[0]

# 5. Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Example of iterating through the dataloader
for images, labels in dataloader:
    print(f"Image batch shape: {images.shape}")
    print(f"Labels batch shape: {labels.shape}")
    break

Image batch shape: torch.Size([8, 3, 224, 224])
Labels batch shape: torch.Size([8])


In [15]:
# Training loop for the model with the train dataloader
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device
model.to(device)
# Set the model to training mode
model.train()
# Set the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set the number of epochs
num_epochs = 3
# Set the loss function
criterion = torch.nn.CrossEntropyLoss()
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in dataloader:
        # Move images and labels to the device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images).logits

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")
# Save the model
# model.save_pretrained("mamba_model")
# processor.save_pretrained("mamba_processor")

Epoch [1/3], Loss: 1.9564
Epoch [2/3], Loss: 1.8762
Epoch [3/3], Loss: 1.8195


In [16]:
# Inference on test set
# Set the model to evaluation mode
model.eval()
# Initialize the processor
processor = AutoImageProcessor.from_pretrained(model_name_or_path)
# Initialize the test dataset
test_dataset = ImageDataset(dataframe=test, image_dir=image_dir, processor=processor)
# Initialize the test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)
# Initialize the predictions list
predictions = []
# Initialize the labels list
labels_list = []
# Inference loop
for images, labels in test_dataloader:
    # Move images to the device
    images = images.to(device)
    # Forward pass
    with torch.no_grad():
        outputs = model(images).logits
    # Get the predicted labels
    _, preds = torch.max(outputs, 1)
    # Append the predictions and labels to the lists
    predictions.extend(preds.cpu().numpy())
    labels_list.extend(labels.cpu().numpy())
# Convert the predictions and labels to numpy arrays
predictions = np.array(predictions)
labels_list = np.array(labels_list)
# Print the classification report
print(classification_report(labels_list, predictions, target_names=classes))
# Save the predictions

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


                            precision    recall  f1-score   support

           Home Furnishing       0.00      0.00      0.00         4
                 Baby Care       0.00      0.00      0.00         5
                   Watches       0.00      0.00      0.00         4
Home Decor & Festive Needs       0.60      0.50      0.55         6
          Kitchen & Dining       0.25      0.60      0.35         5
  Beauty and Personal Care       0.00      0.00      0.00         5
                 Computers       0.75      1.00      0.86         3

                  accuracy                           0.28        32
                 macro avg       0.23      0.30      0.25        32
              weighted avg       0.22      0.28      0.24        32



## **MAMBA S 1 K**

In [17]:
model = AutoModel.from_pretrained("nvidia/MambaVision-S-1K", trust_remote_code=True)

# eval mode for inference
model.cuda().eval()

# prepare image for the model
url = 'http://images.cocodataset.org/val2017/000000020247.jpg'
image = Image.open(requests.get(url, stream=True).raw)
input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=False,
                             mean=model.config.mean,
                             std=model.config.std,
                             crop_mode=model.config.crop_mode,
                             crop_pct=model.config.crop_pct)
inputs = transform(image).unsqueeze(0).cuda()
# model inference
out_avg_pool, features = model(inputs)
print("Size of the averaged pool features:", out_avg_pool.size())  # torch.Size([1, 768])
print("Number of stages in extracted features:", len(features)) # 4 stages
print("Size of extracted features in stage 1:", features[0].size()) # torch.Size([1, 96, 56, 56])
print("Size of extracted features in stage 4:", features[3].size()) # torch.Size([1, 768, 7, 7])

Size of the averaged pool features: torch.Size([1, 768])
Number of stages in extracted features: 4
Size of extracted features in stage 1: torch.Size([1, 96, 56, 56])
Size of extracted features in stage 4: torch.Size([1, 768, 7, 7])


In [18]:
num_classes = 7

class MambaClassifier(nn.Module):
    def __init__(self, backbone, num_classes, hidden_dim=768): # Added hidden_dim
        super().__init__()
        self.backbone = backbone
        self.config = self.backbone.config
        # self.classifier = nn.Linear(640, num_classes) # Original classifier
        self.classifier = nn.Linear(hidden_dim, num_classes) # Modified classifier

    def forward(self, x):
        out_avg_pool, _ = self.backbone(x)
        logits = self.classifier(out_avg_pool)
        return logits


In [19]:
mamba = MambaClassifier(model, num_classes=num_classes).cuda().eval()

image = Image.open(img_test)

input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=False,
                             mean=mamba.config.mean,
                             std=mamba.config.std,
                             crop_mode=mamba.config.crop_mode,
                             crop_pct=mamba.config.crop_pct)


inputs = transform(image).unsqueeze(0).cuda()
# Prédiction
with torch.no_grad():
    logits = mamba(inputs)
    probs = torch.softmax(logits, dim=-1)

print("Probabilités prédites:", probs)

Probabilités prédites: tensor([[0.1360, 0.1641, 0.1267, 0.1375, 0.1245, 0.1609, 0.1504]],
       device='cuda:0')


In [20]:
model = MambaClassifier(AutoModel.from_pretrained("nvidia/MambaVision-S-1K", trust_remote_code=True), num_classes=num_classes).cuda().train()

image = Image.open(img_test)

input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=True,
                             mean=mamba.config.mean,
                             std=mamba.config.std,
                             crop_mode=mamba.config.crop_mode,
                             crop_pct=mamba.config.crop_pct)

dataset = ImageDataset(dataframe=train, image_dir=image_dir, transform=transform)

# 5. Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [21]:
# Training loop for the model with the train dataloader
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device
model.to(device)
# Set the model to training mode
model.train()
# Set the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set the number of epochs
num_epochs = 3
# Set the loss function
criterion = torch.nn.CrossEntropyLoss()
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in dataloader:
        # Move images and labels to the device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        
        # inputs = transform(image).unsqueeze(0).cuda()
        # model inference
        # outputs = model(inputs)
        # end_time = time()
        # logits = outputs["logits"]
        # predicted_class_idx = logits.argmax(-1).item()

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")
# Save the model
# model.save_pretrained("mamba_model")
# processor.save_pretrained("mamba_processor")

Epoch [1/3], Loss: 1.9875
Epoch [2/3], Loss: 1.9445
Epoch [3/3], Loss: 1.9724


In [22]:
# Inference on test set
# Set the model to evaluation mode
model.eval()
# Initialize the test dataset
test_dataset = ImageDataset(dataframe=test, image_dir=image_dir, transform=transform)
# Initialize the test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)
# Initialize the predictions list
predictions = []
# Initialize the labels list
labels_list = []
# Inference loop
for images, labels in test_dataloader:
    # Move images to the device
    images = images.to(device)
    # Forward pass
    with torch.no_grad():
        outputs = model(images)
    # Get the predicted labels
    _, preds = torch.max(outputs, 1)
    # Append the predictions and labels to the lists
    predictions.extend(preds.cpu().numpy())
    labels_list.extend(labels.cpu().numpy())
# Convert the predictions and labels to numpy arrays
predictions = np.array(predictions)
labels_list = np.array(labels_list)
# Print the classification report
print(classification_report(labels_list, predictions, target_names=classes))
# Save the predictions

                            precision    recall  f1-score   support

           Home Furnishing       0.00      0.00      0.00         4
                 Baby Care       0.00      0.00      0.00         5
                   Watches       0.00      0.00      0.00         4
Home Decor & Festive Needs       0.00      0.00      0.00         6
          Kitchen & Dining       0.40      0.40      0.40         5
  Beauty and Personal Care       0.10      0.20      0.13         5
                 Computers       0.00      0.00      0.00         3

                  accuracy                           0.09        32
                 macro avg       0.07      0.09      0.08        32
              weighted avg       0.08      0.09      0.08        32



## **MAMBA B 21 K**

In [23]:
model = MambaClassifier(AutoModel.from_pretrained("nvidia/MambaVision-B-21K", trust_remote_code=True), num_classes=num_classes, hidden_dim=1024).cuda().train()

image = Image.open(img_test)

input_resolution = (3, 224, 224)  # MambaVision supports any input resolutions

transform = create_transform(input_size=input_resolution,
                             is_training=True,
                             mean=mamba.config.mean,
                             std=mamba.config.std,
                             crop_mode=mamba.config.crop_mode,
                             crop_pct=mamba.config.crop_pct)

dataset = ImageDataset(dataframe=train, image_dir=image_dir, transform=transform)

# 5. Create DataLoader
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [24]:
# Training loop for the model with the train dataloader
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move model to the device
model.to(device)
# Set the model to training mode
model.train()
# Set the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# Set the number of epochs
num_epochs = 3
# Set the loss function
criterion = torch.nn.CrossEntropyLoss()
# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in dataloader:
        # Move images and labels to the device
        images = images.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        
        # inputs = transform(image).unsqueeze(0).cuda()
        # model inference
        # outputs = model(inputs)
        # end_time = time()
        # logits = outputs["logits"]
        # predicted_class_idx = logits.argmax(-1).item()

        # Compute loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {running_loss / len(dataloader):.4f}")
# Save the model
# model.save_pretrained("mamba_model")
# processor.save_pretrained("mamba_processor")

Epoch [1/3], Loss: 2.1026
Epoch [2/3], Loss: 2.0404
Epoch [3/3], Loss: 1.9177


In [25]:
# Inference on test set
# Set the model to evaluation mode
model.eval()
# Initialize the test dataset
test_dataset = ImageDataset(dataframe=test, image_dir=image_dir, transform=transform)
# Initialize the test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)
# Initialize the predictions list
predictions = []
# Initialize the labels list
labels_list = []
# Inference loop
for images, labels in test_dataloader:
    # Move images to the device
    images = images.to(device)
    # Forward pass
    with torch.no_grad():
        outputs = model(images)
    # Get the predicted labels
    _, preds = torch.max(outputs, 1)
    # Append the predictions and labels to the lists
    predictions.extend(preds.cpu().numpy())
    labels_list.extend(labels.cpu().numpy())
# Convert the predictions and labels to numpy arrays
predictions = np.array(predictions)
labels_list = np.array(labels_list)
# Print the classification report
print(classification_report(labels_list, predictions, target_names=classes))
# Save the predictions

                            precision    recall  f1-score   support

           Home Furnishing       0.00      0.00      0.00         4
                 Baby Care       0.50      0.40      0.44         5
                   Watches       0.17      0.25      0.20         4
Home Decor & Festive Needs       0.20      0.17      0.18         6
          Kitchen & Dining       0.25      0.20      0.22         5
  Beauty and Personal Care       0.25      0.20      0.22         5
                 Computers       0.00      0.00      0.00         3

                  accuracy                           0.19        32
                 macro avg       0.20      0.17      0.18        32
              weighted avg       0.21      0.19      0.20        32

