# Clothes classification

## Apple MobileCLIP model

https://huggingface.co/apple/MobileCLIP-S1-OpenCLIP

Similar to OpenAI's CLIP model, but lighter and more efficient, which makes it better suited to mobile deployment.

### Python inference test

Load the model

In [28]:
import torch
from PIL import Image
import open_clip  # OpenCLIP library

# Build MobileCLIP-S1 with the 'datacompdr' pretrained weights. Only the model
# and the third returned value (used below as the image preprocessor) are kept;
# the second returned value is discarded.
mobileclip_arch = 'MobileCLIP-S1'
model, _, processor = open_clip.create_model_and_transforms(
    model_name=mobileclip_arch,
    pretrained='datacompdr',
)
# Put the model in evaluation mode before running inference.
# NOTE(review): the model card reportedly recommends reparameterizing the model
# before inference/export — confirm and apply if needed.
model.eval()

# Text tokenizer matching the same architecture.
tokenizer = open_clip.get_tokenizer(mobileclip_arch)

Perform inference

In [82]:
# Test image and the candidate text prompts, one prompt per clothing class.
# The prompt order must stay aligned with `categories_fr` below.
image = Image.open("img_test/pull.jpg")
categories = [
    "a photo of a dress",
    "a photo of a t-shirt",
    "a photo of pants",
    "a photo of a jacket",
    "a photo of underwear",
    "a photo of shoes",
    "a photo of hat",
    "a photo of a pullover",
]

# Run on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Prompts -> token tensor (OpenCLIP tokenizer), moved to the chosen device.
text_inputs = tokenizer(categories).to(device)

# Image -> preprocessed tensor with a leading batch dimension, on the same device.
image_input = processor(image).unsqueeze(0).to(device)

# Inference only, so gradient tracking is disabled.
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

    # Scale every embedding to unit L2 norm so that the matrix product below
    # is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # Scale by 100, then softmax across the prompts. Despite its name,
    # `logits_per_image` already holds probabilities at this point.
    logits_per_image = ((100 * image_features) @ text_features.T).softmax(dim=-1)
    probs = logits_per_image.cpu().squeeze().tolist()

# French display labels, index-aligned with `categories`.
categories_fr = [
    "Robe",
    "T-shirt",
    "Pantalon",
    "Veste",
    "Sous-vêtements",
    "Chaussures",
    "Chapeau",
    "Pull",
]
category_prob_dict = {label: score for label, score in zip(categories_fr, probs)}

# Report every label with its probability.
print("Category probabilities:")
for category, prob in category_prob_dict.items():
    print(f"{category}: {prob:.4f}")

# The predicted class is the label with the largest probability.
most_probable_category = max(
    category_prob_dict,
    key=lambda label: category_prob_dict[label],
)
print(f"\nThe most probable category is: {most_probable_category}")

Category probabilities:
Robe: 0.0000
T-shirt: 0.0000
Pantalon: 0.0000
Veste: 0.0000
Sous-vêtements: 0.0000
Chaussures: 0.0000
Chapeau: 0.0000
Pull: 1.0000

The most probable category is: Pull


### Export to ONNX format for deployment

In [112]:
# Example inputs handed to the exporter: the preprocessed image batch and the
# token ids cast to int32 (the ONNXRuntime test feeds int32 ids as well).
example_inputs = (image_input, text_inputs.to(torch.int32))

# Dimension 0 of both inputs is declared dynamic in the exported graph.
dynamic_batch_axes = {
    'input_ids': {0: 'batch_size'},
    'pixel_values': {0: 'batch_size'},
}

# NOTE(review): the ONNXRuntime test later in this notebook reads outputs 0 and
# 1 as image and text embeddings, so the output names below may not describe
# the tensors accurately — confirm before relying on them.
torch.onnx.export(
    model,
    example_inputs,
    "mobileclip_s1.onnx",
    input_names=["pixel_values", "input_ids"],
    output_names=["logits_per_image", "logits_per_text"],
    dynamic_axes=dynamic_batch_axes,
    opset_version=20,
)



### Quantize the model to reduce its size

In [121]:
from onnxruntime.quantization import quantize_dynamic, QuantType, quant_pre_process

# Dynamically quantize the exported ONNX model, using the QUInt8 weight type,
# and write the result next to the original file.
source_model_path = "mobileclip_s1.onnx"
quantized_model_path = "mobileclip_s1_quant.onnx"
quantize_dynamic(
    model_input=source_model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QUInt8,
)
print(f"Dynamic quantization completed. Quantized model saved at {quantized_model_path}")



Dynamic quantization completed. Quantized model saved at mobileclip_s1_quant.onnx


### Test inference on ONNXRuntime

In [106]:
# Candidate text prompts (same list as in the PyTorch inference test).
categories = [
    "a photo of a dress",
    "a photo of a t-shirt",
    "a photo of pants",
    "a photo of a jacket",
    "a photo of underwear",
    "a photo of shoes",
    "a photo of hat",
    "a photo of a pullover",
]

# Prompts -> token tensor via the OpenCLIP tokenizer.
text_inputs = tokenizer(categories)

# Test image -> preprocessed tensor with a leading batch dimension.
image = Image.open("img_test/pant.jpg")
image_input = processor(image).unsqueeze(0)

In [122]:
import onnxruntime as ort
import numpy as np
import torch

# Open the quantized model with ONNXRuntime.
onnx_model_path = "mobileclip_s1_quant.onnx"
ort_session = ort.InferenceSession(onnx_model_path)

# ONNXRuntime consumes NumPy arrays; the token ids are cast to int32 to match
# the dtype used when the model was exported.
image_input_np = image_input.cpu().numpy()
text_inputs_np = text_inputs.cpu().numpy()
ort_inputs = {
    "pixel_values": image_input_np,
    "input_ids": text_inputs_np.astype(np.int32),
}
ort_outputs = ort_session.run(None, ort_inputs)

# Outputs 0 and 1 are used as the image and text embeddings
# (presumably shaped (1, 512) and (8, 512) — confirm), wrapped as torch tensors.
image_features, text_features = (torch.tensor(arr) for arr in ort_outputs[:2])

# Scale each embedding to unit L2 norm, mirroring the PyTorch inference cell.
image_features, text_features = (
    feats / feats.norm(dim=-1, keepdim=True)
    for feats in (image_features, text_features)
)

# Scaled dot product between the image and every prompt, then softmax across
# the prompts to obtain probabilities.
similarity = (image_features @ text_features.T) * 100
probs = similarity.softmax(dim=-1).squeeze().tolist()

# French display labels, index-aligned with the prompt list.
categories_fr = [
    "Robe",
    "T-shirt",
    "Pantalon",
    "Veste",
    "Sous-vêtements",
    "Chaussures",
    "Chapeau",
    "Pull",
]
category_prob_dict = {label: score for label, score in zip(categories_fr, probs)}

# Report every label with its probability.
print("Category probabilities (ONNXRuntime):")
for category, prob in category_prob_dict.items():
    print(f"{category}: {prob:.4f}")

# The predicted class is the label with the largest probability.
most_probable_category = max(
    category_prob_dict,
    key=lambda label: category_prob_dict[label],
)
print(f"\nThe most probable category is: {most_probable_category}")


Category probabilities (ONNXRuntime):
Robe: 0.1033
T-shirt: 0.0530
Pantalon: 0.0461
Veste: 0.0681
Sous-vêtements: 0.0066
Chaussures: 0.0044
Chapeau: 0.2401
Pull: 0.4784

The most probable category is: Pull


## OpenAI CLIP model

https://huggingface.co/openai/clip-vit-base-patch32

### Python inference test

In [1]:
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# The model weights and the paired processor are both loaded from the same
# Hugging Face checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(clip_checkpoint)
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
# Test image and the candidate text prompts, one prompt per clothing class.
# The prompt order must stay aligned with `categories_fr` below.
image = Image.open("img_test/T-shirt.jpg")
categories = [
    "a photo of a dress",
    "a photo of a t-shirt",
    "a photo of pants",
    "a photo of a jacket",
    "a photo of underwear",
    "a photo of shoes",
    "a photo of hat",
    "a photo of a pullover",
]

# Tokenize the prompts and preprocess the image into PyTorch tensors.
inputs = processor(text=categories, images=image, return_tensors="pt", padding=True)

# Inference only: run the forward pass without gradient tracking, as the
# MobileCLIP test does, so no autograd graph is built for the outputs.
with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image  # Image-text similarity scores
probs = logits_per_image.softmax(dim=1)  # Softmax over the prompts -> probabilities

# Drop the batch dimension and convert to a plain Python list.
probs_list = probs.squeeze().tolist()

# French display labels, index-aligned with `categories`.
categories_fr = ["Robe", "T-shirt", "Pantalon", "Veste", "Sous-vêtements", "Chaussures", "Chapeau", "Pull"]

# Map each label to its probability.
category_prob_dict = dict(zip(categories_fr, probs_list))

# Report every label with its probability.
print("Category probabilities:")
for category, prob in category_prob_dict.items():
    print(f"{category}: {prob:.4f}")

# The predicted class is the label with the largest probability.
most_probable_category = max(category_prob_dict, key=category_prob_dict.get)
print(f"\nThe most probable category is: {most_probable_category}")

Category probabilities:
Robe: 0.0001
T-shirt: 0.9947
Pantalon: 0.0004
Veste: 0.0002
Sous-vêtements: 0.0005
Chaussures: 0.0000
Chapeau: 0.0000
Pull: 0.0041

The most probable category is: T-shirt


### Export to ONNX format for deployment

In [6]:
# Example inputs handed to the exporter, taken from the processor output of the
# inference cell. Only the token ids and pixel values are passed; the
# processor's other fields (if any) are not part of the exported graph inputs.
example_inputs = (inputs['input_ids'], inputs['pixel_values'])

# Dimension 0 of both inputs is declared dynamic in the exported graph.
dynamic_batch_axes = {
    'input_ids': {0: 'batch_size'},
    'pixel_values': {0: 'batch_size'},
}

torch.onnx.export(
    model,
    example_inputs,
    "clip_clothes_classification.onnx",
    input_names=["input_ids", "pixel_values"],
    output_names=["logits_per_image", "logits_per_text"],
    dynamic_axes=dynamic_batch_axes,
    opset_version=20,
)

  if input_shape[-1] > 1 or self.sliding_window is not None:
  if past_key_values_length > 0:


### Quantize the model to reduce its size

In [12]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamically quantize the exported ONNX model, using the QUInt8 weight type,
# and write the result next to the original file.
source_model_path = "clip_clothes_classification.onnx"
quantized_model_path = "clip_clothes_classification_quant.onnx"
quantize_dynamic(
    model_input=source_model_path,
    model_output=quantized_model_path,
    weight_type=QuantType.QUInt8,
)
print(f"Dynamic quantization completed. Quantized model saved at {quantized_model_path}")



Dynamic quantization completed. Quantized model saved at clip_clothes_classification_quant.onnx


### Test the ONNX model with ONNXRuntime

In [25]:
import onnxruntime as ort


# Open the quantized CLIP model with ONNXRuntime.
ort_session = ort.InferenceSession("clip_clothes_classification_quant.onnx")

# ONNXRuntime consumes NumPy arrays, so convert the processor's tensors.
# `input_ids` is also reused further down for the Dart export.
input_ids = inputs['input_ids'].numpy()
pixel_values = inputs['pixel_values'].numpy()
ort_feed = {
    "input_ids": input_ids,
    "pixel_values": pixel_values,
}

# Request every output; output 0 is the one used below.
outputs = ort_session.run(None, ort_feed)

# Softmax over the prompts. Despite its name, `logits` holds probabilities
# after this line.
logits = torch.Tensor(outputs[0]).softmax(dim=1)

# Drop the batch dimension and convert to a plain Python list.
probs_list = logits.squeeze().tolist()

# French display labels, index-aligned with the prompt list.
categories_fr = [
    "Robe",
    "T-shirt",
    "Pantalon",
    "Veste",
    "Sous-vêtements",
    "Chaussures",
    "Chapeau",
    "Pull",
]
category_prob_dict = {label: score for label, score in zip(categories_fr, probs_list)}

# Report every label with its probability.
print("Category probabilities:")
for category, prob in category_prob_dict.items():
    print(f"{category}: {prob:.4f}")

# The predicted class is the label with the largest probability.
most_probable_category = max(
    category_prob_dict,
    key=lambda label: category_prob_dict[label],
)
print(f"\nThe most probable category is: {most_probable_category}")

Category probabilities:
Robe: 0.0047
T-shirt: 0.9627
Pantalon: 0.0053
Veste: 0.0057
Sous-vêtements: 0.0084
Chaussures: 0.0005
Chapeau: 0.0064
Pull: 0.0063

The most probable category is: T-shirt


### Export the input_ids (tokenized categories)

We need to export the tokenized categories so that we can use them in the Flutter app as input for the ONNX model, since the model does not include the tokenizer.

#### Dart list

In [12]:
# Convert the token ids to plain nested Python lists before formatting.
# `input_ids` is a NumPy array here, and interpolating an array directly into
# an f-string yields whitespace-separated values without commas
# (e.g. "[[1 2 3]\n [4 5 6]]"), which is not a valid Dart list literal.
# A nested Python list prints as "[[1, 2, 3], [4, 5, 6]]", which Dart accepts.
input_ids_rows = input_ids.tolist() if hasattr(input_ids, "tolist") else input_ids

# Format the nested list as a Dart constant declaration.
dart_array_str = f"const List<List<int>> inputIds = {input_ids_rows};"

# Save it to a Dart file, with an explicit encoding so the output does not
# depend on the platform default.
with open("input_ids.dart", "w", encoding="utf-8") as dart_file:
    dart_file.write(dart_array_str)

print("Dart array saved to input_ids.dart")

Dart array saved to input_ids.dart
