In [7]:
import os
from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
from PIL import Image
import torch.nn as nn
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
model_name = 'sayeed99/segformer_b3_clothes'

In [None]:
# Load the image processor and the pre-trained model for semantic segmentation
processor = SegformerImageProcessor.from_pretrained(model_name)
model = AutoModelForSemanticSegmentation.from_pretrained(model_name)

# Set the device to GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the selected device

In [None]:
predict_img = '/home/jcaldeira/dressing_virtuel_data_collector/media/tmp/detection/100_0144.JPG'

image = Image.open(predict_img)
if image.mode != "RGB":
    image = image.convert("RGB")

inputs = processor(images=image, return_tensors="pt")

# Move inputs to the same device as the model
inputs = {k: v.to(device) for k, v in inputs.items()}

# Perform inference to obtain logits from the model
outputs = model(**inputs)
logits = outputs.logits

# Upsample the logits to match the original image size
upsampled_logits = nn.functional.interpolate(
    logits,
    size=image.size[::-1],  # Reverse the size to (height, width)
    mode="bilinear",
    align_corners=False,
)

# Generate segmentation map and filter valid labels
pred_seg = upsampled_logits.argmax(dim=1)[0].cpu().numpy()
probabilities = nn.functional.softmax(upsampled_logits, dim=1)

certainty_mask = probabilities.max(dim=1).values > 0.7
certainty_mask_np = certainty_mask.squeeze().cpu().numpy()
valid_labels = [lbl for lbl in np.unique(pred_seg[certainty_mask_np]) if lbl in [4, 5, 6, 7]]

for label in valid_labels:
    # Convert the segmentation map to a binary mask for the target class
    target_class = label  # Define the class to be extracted
    binary_mask = (pred_seg == target_class).astype(np.uint8)

    # Find the bounding box of the target segment using non-zero indices of the binary mask
    non_zero_indices = np.nonzero(binary_mask)
    
    # Calculate the bounding box limits (min and max coordinates)
    min_y, max_y = np.min(non_zero_indices[0]), np.max(non_zero_indices[0])
    min_x, max_x = np.min(non_zero_indices[1]), np.max(non_zero_indices[1])

    # Crop the original image using the calculated bounding box limits
    cropped_image = image.crop((min_x, min_y, max_x, max_y))

    # Display the cropped image
    plt.imshow(cropped_image)
    plt.axis("off")
    plt.show()

In [None]:
from transformers import AutoModelForSemanticSegmentation, SegformerImageProcessor
from datasets import load_dataset
import evaluate
import torch

### fashion_segmentation

In [None]:
dataset = load_dataset("sayeed99/fashion_segmentation")


In [None]:
dataset

In [None]:
plt.imshow(dataset['train'][2]['label'])
plt.axis('off')
plt.show()

In [None]:
from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import confusion_matrix
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn.functional as F


# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Load the processor and model
processor = SegformerImageProcessor.from_pretrained("sayeed99/segformer_b3_clothes")
model = AutoModelForSemanticSegmentation.from_pretrained("sayeed99/segformer_b3_clothes").to(device)
model.eval()

# Load the dataset (replace 'test' with your desired split)
dataset = load_dataset("sayeed99/fashion_segmentation", split="train")

# Function to calculate metrics
def evaluate_segmentation(predictions, ground_truth, num_classes):
    confusion = confusion_matrix(ground_truth.flatten(), predictions.flatten(), labels=range(num_classes))
    pixel_accuracy = np.diag(confusion).sum() / confusion.sum()
    class_iou = np.diag(confusion) / (confusion.sum(axis=1) + confusion.sum(axis=0) - np.diag(confusion))
    mean_iou = np.nanmean(class_iou)
    return pixel_accuracy, mean_iou, class_iou

# Placeholder for evaluation
num_classes = 17  # Adjust based on your labels
all_preds = []
all_labels = []

for idx, sample in enumerate(dataset.select(range(10))):
    # Preprocess the image
    image = Image.fromarray(np.array(sample['image']))
    inputs = processor(images=image, return_tensors="pt").to(device)
    
    # Get ground truth mask
    ground_truth = np.array(sample['label'])
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Shape: [batch_size, num_classes, height, width]
        # Resize logits to match mask size
        logits_resized = F.interpolate(
            logits, 
            size=ground_truth.shape,  # (height, width) format
            mode="bilinear",
            align_corners=False
        )
        predictions = torch.argmax(logits_resized.squeeze(0), dim=0).cpu().numpy()  # Predicted mask

    print(f"Prediction shape after resizing: {predictions.shape}, Ground truth shape: {ground_truth.shape}")
    
    # Store predictions and labels
    all_preds.append(predictions)
    all_labels.append(ground_truth)

# Combine all predictions and labels
print(f"Prediction shape: {predictions.shape}, Ground truth shape: {ground_truth.shape}")

all_preds = np.concatenate([p.flatten() for p in all_preds])
all_labels = np.concatenate([l.flatten() for l in all_labels])

# Evaluate
pixel_accuracy, mean_iou, class_iou = evaluate_segmentation(all_preds, all_labels, num_classes)
print(f"Pixel Accuracy: {pixel_accuracy:.4f}")
print(f"Mean IoU: {mean_iou:.4f}")
print(f"Per-Class IoU: {class_iou}")

# Visualize a sample result
def visualize_sample(image, ground_truth, prediction):
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.title("Original Image")
    plt.axis("off")
    
    plt.subplot(1, 3, 2)
    plt.imshow(ground_truth, cmap="jet", alpha=0.7)
    plt.title("Ground Truth")
    plt.axis("off")
    
    plt.subplot(1, 3, 3)
    plt.imshow(prediction, cmap="jet", alpha=0.7)
    plt.title("Prediction")
    plt.axis("off")
    plt.show()



In [121]:
# Example visualization
lookfor = 8
start_position = 0
for i, smp in enumerate(dataset.select(range(10))):
    if i != lookfor:
        start_position = start_position + np.array(dataset[i]['label']).size
        continue
    break



In [None]:
sample_image = np.array(dataset[lookfor]['image'])
sample_ground_truth = np.array(dataset[lookfor]['label'])
end_position = start_position + np.array(dataset[lookfor]['label']).size
sample_prediction = all_preds[start_position:end_position].reshape(sample_ground_truth.shape)
visualize_sample(sample_image, sample_ground_truth, sample_prediction)

### human_parsing_dataset

In [2]:
from datasets import load_dataset

dataset = load_dataset("mattmdjaga/human_parsing_dataset", split="train")

In [3]:
dataset

Dataset({
    features: ['image', 'mask'],
    num_rows: 17706
})

In [10]:
ds_test = dataset.train_test_split(test_size=0.05)


In [11]:
ds_test['test']

Dataset({
    features: ['image', 'mask'],
    num_rows: 886
})

In [None]:
import matplotlib.pyplot as plt

plt.imshow(ds_test['test'][0]['mask'])
plt.axis('off')
plt.show()

In [12]:
from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import confusion_matrix
from PIL import Image
import matplotlib.pyplot as plt
import torch.nn.functional as F


# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Load the processor and model
processor = SegformerImageProcessor.from_pretrained("sayeed99/segformer_b3_clothes")
model = AutoModelForSemanticSegmentation.from_pretrained("sayeed99/segformer_b3_clothes").to(device)
model.eval()

# Load the dataset (replace 'test' with your desired split)
dataset = load_dataset("mattmdjaga/human_parsing_dataset", split="train")

# Function to calculate metrics
def evaluate_segmentation(predictions, ground_truth, num_classes):
    confusion = confusion_matrix(ground_truth.flatten(), predictions.flatten(), labels=range(num_classes))
    pixel_accuracy = np.diag(confusion).sum() / confusion.sum()
    class_iou = np.diag(confusion) / (confusion.sum(axis=1) + confusion.sum(axis=0) - np.diag(confusion))
    mean_iou = np.nanmean(class_iou)
    return pixel_accuracy, mean_iou, class_iou

# Placeholder for evaluation
num_classes = 17  # Adjust based on your labels
all_preds = []
all_labels = []

ds_train_test = dataset.train_test_split(test_size=0.05)
ds_test = ds_train_test['test']

for idx, sample in enumerate(ds_test):
    # Preprocess the image
    image = Image.fromarray(np.array(sample['image']))
    inputs = processor(images=image, return_tensors="pt").to(device)
    
    # Get ground truth mask
    ground_truth = np.array(sample['mask'])
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits  # Shape: [batch_size, num_classes, height, width]
        # Resize logits to match mask size
        logits_resized = F.interpolate(
            logits, 
            size=ground_truth.shape,  # (height, width) format
            mode="bilinear",
            align_corners=False
        )
        predictions = torch.argmax(logits_resized.squeeze(0), dim=0).cpu().numpy()  # Predicted mask

    #print(f"Prediction shape after resizing: {predictions.shape}, Ground truth shape: {ground_truth.shape}")
    
    # Store predictions and labels
    all_preds.append(predictions)
    all_labels.append(ground_truth)

# Combine all predictions and labels
#print(f"Prediction shape: {predictions.shape}, Ground truth shape: {ground_truth.shape}")

all_preds = np.concatenate([p.flatten() for p in all_preds])
all_labels = np.concatenate([l.flatten() for l in all_labels])

# Evaluate
pixel_accuracy, mean_iou, class_iou = evaluate_segmentation(all_preds, all_labels, num_classes)
print(f"Pixel Accuracy: {pixel_accuracy:.4f}")
print(f"Mean IoU: {mean_iou:.4f}")
print(f"Per-Class IoU: {class_iou}")

# Visualize a sample result
def visualize_sample(image, ground_truth, prediction):
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 3, 1)
    plt.imshow(image)
    plt.title("Original Image")
    plt.axis("off")
    
    plt.subplot(1, 3, 2)
    plt.imshow(ground_truth, cmap="jet", alpha=0.7)
    plt.title("Ground Truth")
    plt.axis("off")
    
    plt.subplot(1, 3, 3)
    plt.imshow(prediction, cmap="jet", alpha=0.7)
    plt.title("Prediction")
    plt.axis("off")
    plt.show()



Using device: cuda
Pixel Accuracy: 0.9618
Mean IoU: 0.6455
Per-Class IoU: [0.98423157 0.79002363 0.81309402 0.61262898 0.89896534 0.89910038
 0.8977645  0.90094456 0.46674161 0.36851916 0.23119747 0.85106432
 0.40308592 0.25615023 0.431395   0.36301823 0.80618312]


In [None]:
# Example visualization
idx=8
sample_image = np.array(ds_test[idx]['image'])
sample_ground_truth = np.array(ds_test[idx]['mask'])
start_position = (idx) * sample_ground_truth.size
end_position = (idx+1) * sample_ground_truth.size
sample_prediction = all_preds[start_position:end_position].reshape(sample_ground_truth.shape)
visualize_sample(sample_image, sample_ground_truth, sample_prediction)
