In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import numpy as np
from PIL import Image

## Loading the Model from Hugging Face

In [2]:
from transformers import AutoImageProcessor, AutoModelForSemanticSegmentation
from PIL import Image
import torch

processor = AutoImageProcessor.from_pretrained("matei-dorian/segformer-b5-finetuned-human-parsing");
model = AutoModelForSemanticSegmentation.from_pretrained("matei-dorian/segformer-b5-finetuned-human-parsing");

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
  image_processor = cls(**image_processor_dict)


### Loading the Example Image for Segmentation

In [3]:
img_path = "D:\Code Playground\wardrob-aI\data\my_photos\person_02.png"
image = Image.open(img_path).convert("RGB")

#### Doing all the Pre-processing stuffs and sending the input to the Model

In [4]:
inputs = processor(images=image, return_tensors="pt") # All pre-processing stuffs to the example image
outputs = model(**inputs) # Sending input image to the Model

In [5]:
logits = outputs.logits

### Resizes the output image back to the original size

##### Step-by-step:
- Takes the low-resolution segmentation logits
- Upscales them (resizes) back to full image resolution
- Uses bilinear interpolation for smooth resizing

In [6]:
upsampled_logits = torch.nn.functional.interpolate(
    logits,
    size=image.size[::-1],  # (height, width)
    mode="bilinear",
    align_corners=False
)
pred_mask = upsampled_logits.argmax(dim=1)[0].numpy()

In [7]:
num_classes = logits.shape[1]
print("Total classes:", num_classes)
unique_ids = np.unique(pred_mask)
print("Classes present in image:", unique_ids)

Total classes: 18
Classes present in image: [ 0  2  4  6  9 10 11 12 14 15]


### Defining stable color palette for ATR 18 classes

In [None]:
import numpy as np
from PIL import Image

# Predefined stable color palette for ATR 18 classes
ATR_COLORS = {
    0:  (0, 0, 0),          # Background - Black
    1:  (128, 0, 0),        # Hat
    2:  (255, 0, 0),        # Hair
    3:  (255, 255, 0),      # Sunglasses
    4:  (0, 128, 0),        # Upper-clothes
    5:  (0, 255, 0),        # Skirt
    6:  (0, 0, 128),        # Pants
    7:  (0, 0, 255),        # Dress
    8:  (128, 128, 0),      # Belt
    9:  (128, 0, 128),      # Left-shoe
    10: (255, 0, 255),      # Right-shoe
    11: (255, 200, 150),    # Face (skin tone)
    12: (150, 150, 255),    # Left-leg
    13: (180, 180, 255),    # Right-leg
    14: (255, 180, 180),    # Left-arm
    15: (255, 150, 150),    # Right-arm
    16: (0, 150, 150),      # Bag
    17: (0, 255, 255),      # Scarf
}


def visualize_mask(mask, save_path="D:/Code Playground/wardrob-ai/experiments/output/masks/parsed_output.png"):
    """
    Convert class mask → color mask using predefined ATR color palette
    """
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    H, W = mask.shape
    color_mask = np.zeros((H, W, 3), dtype=np.uint8)

    for class_id, color in ATR_COLORS.items():
        color_mask[mask == class_id] = color

    Image.fromarray(color_mask).save(save_path)
    print(f"Output mask saved at: {save_path}")

### All the pre-processing + input + output of the Model at same time

In [9]:
def human_parse(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits
    upsampled = torch.nn.functional.interpolate(
        logits, size=image.size[::-1], mode="bilinear", align_corners=False
    )
    mask = upsampled.argmax(dim=1)[0].numpy()
    return image, mask

### Id's to Human readable Labels

In [10]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained("matei-dorian/segformer-b5-finetuned-human-parsing")
id2label = config.id2label
id2label

{0: 'Background',
 1: 'Hat',
 2: 'Hair',
 3: 'Sunglasses',
 4: 'Upper-clothes',
 5: 'Skirt',
 6: 'Pants',
 7: 'Dress',
 8: 'Belt',
 9: 'Left-shoe',
 10: 'Right-shoe',
 11: 'Face',
 12: 'Left-leg',
 13: 'Right-leg',
 14: 'Left-arm',
 15: 'Right-arm',
 16: 'Bag',
 17: 'Scarf'}

## Now we can do something like this

In [11]:
image, mask = human_parse(img_path)
visualize_mask(mask)

FileNotFoundError: [Errno 2] No such file or directory: 'D:/Code Playground/wardrob-ai/experiments/output/masks/parsed_output.png'

## Now its time to extract the required mask

##### 1. How do we know which masks are required?
- We know this because all virtual try-on research papers and official implementations follow the same preprocessing stage.
    Here are the main sources:

    - VITON (2018) paper + code

    - CP-VTON (2018)

    - CP-VTON+ (2020)

    - VITON-HD (2022)

    - HR-VITON (2022)

    - TryOnDiffusion (2023)

##### 2. Why the current Raw segmentation mask is not enough?
- because it looks like this: ``` [0 2 4 6 9 10 11 ...] ``` and this is only a class label map.
- The try-on system needs binary masks for:
    - removing the old clothes
    - protecting skin
    - fitting new clothes
    - warping garments
    - generating the human body shape
- You cannot feed class IDs directly into VTON pipelines.

##### 3. So what masks we need?
- mask_upper.png
- mask_skin.png
- mask_hair.png
- mask_body.png
- agnostic_person.png

In [None]:
import numpy as np
from PIL import Image


# ========= ATR CLASS LABELS (Your model, 18 classes) =========
ATR_ID = {
    "background": 0,
    "hat": 1,
    "hair": 2,
    "sunglasses": 3,
    "upper": 4,       # Upper-clothes
    "skirt": 5,
    "pants": 6,
    "dress": 7,
    "belt": 8,
    "left_shoe": 9,
    "right_shoe": 10,
    "face": 11,
    "left_leg": 12,
    "right_leg": 13,
    "left_arm": 14,
    "right_arm": 15,
    "bag": 16,
    "scarf": 17,
}


# ========= MASK GROUP DEFINITIONS =========
UPPER_CLOTHES = [ATR_ID["upper"], ATR_ID["dress"]]       # clothing to remove
SKIN = [ATR_ID["face"], ATR_ID["left_arm"], ATR_ID["right_arm"],
        ATR_ID["left_leg"], ATR_ID["right_leg"]]
HAIR = [ATR_ID["hair"]]
BACKGROUND = [ATR_ID["background"]]


def create_mask(pred_mask, class_ids):
    """
    Create a binary mask for given class IDs.
    """
    mask = np.isin(pred_mask, class_ids).astype(np.uint8) * 255
    return mask


def extract_masks(pred_mask, original_image, out_dir="masks"):
    import os
    os.makedirs(out_dir, exist_ok=True)
    """
    Extract and save required WardrobeAI masks:
        - upper clothes
        - skin
        - hair
        - full body (person)
        - agnostic person (clothes removed)
    """

    H, W = pred_mask.shape

    # Convert PIL image → numpy
    img_np = np.array(original_image)

    # MASK: UPPER CLOTHES
    mask_upper = create_mask(pred_mask, UPPER_CLOTHES)
    Image.fromarray(mask_upper).save(f"{out_dir}/mask_upper.png")

    # MASK: SKIN
    mask_skin = create_mask(pred_mask, SKIN)
    Image.fromarray(mask_skin).save(f"{out_dir}/mask_skin.png")

    # 3. MASK: HAIR
    mask_hair = create_mask(pred_mask, HAIR)
    Image.fromarray(mask_hair).save(f"{out_dir}/mask_hair.png")

    # MASK: PERSON / BODY
    all_body_classes = list(range(1, 18))   # everything except background
    mask_body = create_mask(pred_mask, all_body_classes)
    Image.fromarray(mask_body).save(f"{out_dir}/mask_body.png")

    # AGNOSTIC PERSON
    # Start with original
    agnostic = img_np.copy()

    # Remove upper clothes → fill with gray
    agnostic[mask_upper == 255] = [128, 128, 128]

    # Keep skin, face, hair, arms, etc. as is
    # Background remains background

    Image.fromarray(agnostic).save(f"{out_dir}/agnostic_person.png")
    print("All masks saved in:", out_dir)

In [None]:
extract_masks(mask, image, out_dir = "D:/Code Playground/wardrob-ai/experiments/masks")

All masks saved in: D:/Code Playground/wardrob-ai/experiments/masks


## Connecting all the Dots

##### Final pipeline to automate segemantaion + mask extraction

In [None]:
def run_segmentation_pipeline(image_path):
    image, mask = human_parse(image_path)
    visualize_mask(mask)
    extract_masks(mask, image, out_dir = "D:/Code Playground/wardrob-ai/experiments/output/masks")

In [None]:
run_segmentation_pipeline(img_path)

Output mask saved at: parsed_output.png
All masks saved in: D:/Code Playground/wardrob-ai/experiments/masks
