# Clothes classification

## Vision Language model

### Python inference test

In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Both the CLIP weights and the matching processor come from the same
# pretrained checkpoint, so the identifier is declared once.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Zero-shot classification of a single test image with CLIP: every text prompt
# is scored against the image and the scores are turned into probabilities.
image = Image.open("img_test/T-shirt.jpg")

# English prompts fed to CLIP.
# NOTE(review): "a photo of hat" lacks the article the other prompts have; it is
# left untouched because the token ids exported for the app depend on it.
categories = ["a photo of a dress", "a photo of a t-shirt", "a photo of pants", "a photo of a jacket", "a photo of underwear", "a photo of shoes", "a photo of hat", "a photo of a pullover"]

# French display labels, index-aligned with `categories`.
categories_fr = ["Robe", "T-shirt", "Pantalon", "Veste", "Sous-vêtements", "Chaussures", "Chapeau", "Pull"]
assert len(categories) == len(categories_fr), "categories and categories_fr must stay index-aligned"

# Preprocess the image and text for CLIP (prompts are padded to a common length)
inputs = processor(text=categories, images=image, return_tensors="pt", padding=True)

# Inference only: skip autograd bookkeeping for the forward pass
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # Image-text similarity score
probs = logits_per_image.softmax(dim=1)  # Convert to probabilities over the prompts

# Take the row of the (single) image explicitly; unlike squeeze(), this still
# yields a list when only one category is supplied.
probs_list = probs[0].tolist()

# Map each French label to its probability
category_prob_dict = dict(zip(categories_fr, probs_list))

# Print the dictionary
print("Category probabilities:")
for category, prob in category_prob_dict.items():
    print(f"{category}: {prob:.4f}")

# Find the category with the highest probability
most_probable_category = max(category_prob_dict, key=category_prob_dict.get)
print(f"\nThe most probable category is: {most_probable_category}")

Category probabilities:
Robe: 0.0001
T-shirt: 0.9947
Pantalon: 0.0004
Veste: 0.0002
Sous-vêtements: 0.0005
Chaussures: 0.0000
Chapeau: 0.0000
Pull: 0.0041

The most probable category is: T-shirt


### Export to ONNX format for deployment

In [6]:
# Export the CLIP model to ONNX, tracing it with the tokenized prompts and the
# preprocessed image produced by the classification cell.
#
# The text batch (number of prompts) and the image batch (number of images) are
# independent dimensions, so they get distinct symbolic names instead of sharing
# a single "batch_size". The token-length axis is dynamic as well, so prompt
# sets that pad to a different length are not rejected by the exported graph.
torch.onnx.export(
    model,
    (inputs["input_ids"], inputs["pixel_values"]),
    "clip_clothes_classification.onnx",
    input_names=["input_ids", "pixel_values"],
    output_names=["logits_per_image", "logits_per_text"],
    dynamic_axes={
        "input_ids": {0: "text_batch_size", 1: "sequence_length"},
        "pixel_values": {0: "image_batch_size"},
        "logits_per_image": {0: "image_batch_size", 1: "text_batch_size"},
        "logits_per_text": {0: "text_batch_size", 1: "image_batch_size"},
    },
    opset_version=20,
)

  if input_shape[-1] > 1 or self.sliding_window is not None:
  if past_key_values_length > 0:


### Quantize the model to reduce its size

In [12]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamically quantize the exported ONNX model (weight type: QUInt8) and write
# the result next to the original file.
quantized_model_path = "clip_clothes_classification_quant.onnx"
quantize_dynamic(
    model_input="clip_clothes_classification.onnx",
    model_output=quantized_model_path,
    weight_type=QuantType.QUInt8,
)
print(f"Dynamic quantization completed. Quantized model saved at {quantized_model_path}")



Dynamic quantization completed. Quantized model saved at clip_clothes_classification_quant.onnx


### Test the ONNX model with ONNXRuntime

In [25]:
import onnxruntime as ort


# Sanity-check the quantized ONNX model with ONNX Runtime, feeding it the same
# tokenized prompts and preprocessed image used for the PyTorch run.
ort_session = ort.InferenceSession("clip_clothes_classification_quant.onnx")

input_ids = inputs['input_ids'].numpy()
pixel_values = inputs['pixel_values'].numpy()

# Run the inference. The result is kept under its own name so the PyTorch
# `outputs` object from the classification cell is not overwritten.
ort_outputs = ort_session.run(None, {
    "input_ids": input_ids,
    "pixel_values": pixel_values,
})

# The first output is `logits_per_image` (first entry of the export's
# output_names); softmax over the prompt axis turns it into probabilities.
ort_probs = torch.from_numpy(ort_outputs[0]).softmax(dim=1)

# Take the row of the (single) image explicitly; unlike squeeze(), this still
# yields a list when only one category is supplied.
probs_list = ort_probs[0].tolist()

# French display labels, in the same order as the prompts that were tokenized
categories_fr = ["Robe", "T-shirt", "Pantalon", "Veste", "Sous-vêtements", "Chaussures", "Chapeau", "Pull"]

# Map each French label to its probability
category_prob_dict = dict(zip(categories_fr, probs_list))

# Print the dictionary
print("Category probabilities:")
for category, prob in category_prob_dict.items():
    print(f"{category}: {prob:.4f}")

# Find the category with the highest probability
most_probable_category = max(category_prob_dict, key=category_prob_dict.get)
print(f"\nThe most probable category is: {most_probable_category}")

Category probabilities:
Robe: 0.0047
T-shirt: 0.9627
Pantalon: 0.0053
Veste: 0.0057
Sous-vêtements: 0.0084
Chaussures: 0.0005
Chapeau: 0.0064
Pull: 0.0063

The most probable category is: T-shirt


### Export the input_ids (tokenized categories)

We need to export the tokenized categories so that we can use them in the Flutter app as input for the ONNX model, since the exported model does not include the tokenizer.

#### Dart list

In [12]:
# Serialize the tokenized prompts as a Dart constant for the Flutter app.
#
# The ids are converted to nested Python lists first: their repr
# ("[[1, 2], [3, 4]]") is also a valid Dart list literal, whereas formatting a
# NumPy array directly produces space-separated rows without commas, which Dart
# cannot parse. Reading from `inputs` also means this cell only needs the
# classification cell to have run, not the ONNX Runtime cell.
input_ids_rows = inputs["input_ids"].tolist()
dart_array_str = f"const List<List<int>> inputIds = {input_ids_rows};\n"

# Save it to a Dart file
with open("input_ids.dart", "w", encoding="utf-8") as dart_file:
    dart_file.write(dart_array_str)

print("Dart array saved to input_ids.dart")

Dart array saved to input_ids.dart
