In [None]:
from google.colab import drive
drive.mount('/content/drive')
import kagglehub
import os
from sklearn.model_selection import train_test_split
import random
import matplotlib.pyplot as plt
from PIL import Image, ImageOps
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import mobilenet_v3_large, MobileNet_V3_Large_Weights
import numpy as np
import pandas as pd
from tqdm import tqdm
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.transforms.functional import InterpolationMode
import torch.backends.cudnn as cudnn
from torch.cuda.amp import autocast, GradScaler

# Fetch both person-segmentation datasets through kagglehub. Each call returns
# the local directory in which that dataset is available.
nikhilroxtomar_person_segmentation_path = kagglehub.dataset_download('nikhilroxtomar/person-segmentation')
tapakah68_supervisely_filtered_segmentation_person_dataset_path = kagglehub.dataset_download('tapakah68/supervisely-filtered-segmentation-person-dataset')

# Echo the resolved dataset roots for inspection.
print('Data source import complete.')
print(nikhilroxtomar_person_segmentation_path)
print(tapakah68_supervisely_filtered_segmentation_person_dataset_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using Colab cache for faster access to the 'person-segmentation' dataset.
Using Colab cache for faster access to the 'supervisely-filtered-segmentation-person-dataset' dataset.
Data source import complete.
/kaggle/input/person-segmentation
/kaggle/input/supervisely-filtered-segmentation-person-dataset


In [None]:
# Locate both datasets through the directories returned by kagglehub instead of
# a hard-coded Colab mount point, so this cell follows the data wherever it was
# downloaded. Every *_path string ends with a path separator because file names
# are appended to it by plain string concatenation.
images_path = os.path.join(
    tapakah68_supervisely_filtered_segmentation_person_dataset_path,
    "supervisely_person_clean_2667_img",
    "supervisely_person_clean_2667_img",
    "",
)
# The "images" and "masks" columns of df.csv are both resolved against the
# same base directory.
masks_path = images_path

images_path2 = os.path.join(nikhilroxtomar_person_segmentation_path, "people_segmentation", "images", "")
masks_path2 = os.path.join(nikhilroxtomar_person_segmentation_path, "people_segmentation", "masks", "")

df = pd.read_csv(
    os.path.join(tapakah68_supervisely_filtered_segmentation_person_dataset_path, "df.csv")
)

images_path2_list = sorted(os.listdir(images_path2))
masks_path2_list = sorted(os.listdir(masks_path2))

# The second dataset is paired purely by sorted position, so make sure that the
# i-th image and the i-th mask really share a file stem; otherwise masks would
# silently be attached to the wrong images.
image_stems = [os.path.splitext(name)[0] for name in images_path2_list]
mask_stems = [os.path.splitext(name)[0] for name in masks_path2_list]
if image_stems != mask_stems:
    raise ValueError(
        "person-segmentation images and masks do not line up by file name "
        f"({len(images_path2_list)} images vs {len(masks_path2_list)} masks); "
        "positional pairing would attach masks to the wrong images"
    )

# Supervisely rows: turn the relative names from df.csv into full paths.
# coef records the mask value scale of the source (1 here, 255 below).
df2 = df[["images", "masks"]].copy()
df2["images"] = images_path + df2["images"]
df2["masks"] = masks_path + df2["masks"]
df2["coef"] = 1

# person-segmentation rows, paired by position (validated above).
df3 = pd.DataFrame({
    "images": [images_path2 + name for name in images_path2_list],
    "masks": [masks_path2 + name for name in masks_path2_list],
    "coef": 255,
})

final_df = pd.concat([df2, df3], ignore_index=True)

# NOTE(review): if the two sources contain the same photographs, a purely random
# split can place near-duplicates in both partitions -- confirm.
X_train_raw, X_test_raw = train_test_split(final_df, test_size=0.1, random_state=42)

In [None]:
# Work on a fresh copy of the combined frame and label each row with the
# dataset it came from: coef == 1 marks Supervisely rows, anything else is
# labelled as person-segmentation. Then report the total and per-source counts.
final_df = final_df.assign(
    source=np.where(final_df["coef"].eq(1), "supervisely", "person_seg")
)
print("Tổng số mẫu:", final_df.shape[0])
print("\nSố mẫu theo nguồn:")
print(final_df["source"].value_counts())

def dice_loss(logits, target, eps=1e-6):
    """Soft Dice loss computed over all elements of the batch at once.

    Args:
        logits: Raw (pre-sigmoid) model outputs.
        target: Ground-truth mask with the same shape as ``logits``; it is
            cast to float before use.
        eps: Smoothing term added to numerator and denominator.

    Returns:
        Scalar tensor ``1 - dice`` where ``dice`` is evaluated on the sigmoid
        probabilities (no thresholding).
    """
    probabilities = torch.sigmoid(logits)
    target_f = target.float()
    overlap = torch.sum(probabilities * target_f)
    total_mass = probabilities.sum() + target_f.sum()
    dice_coeff = (2. * overlap + eps) / (total_mass + eps)
    return 1 - dice_coeff

def dice_score_from_logits(logits, target, thr=0.5, eps=1e-6):
    """Hard Dice score over the whole batch, returned as a Python float.

    The logits are passed through a sigmoid and binarized with ``prob > thr``
    before the overlap with ``target`` is measured.

    Args:
        logits: Raw (pre-sigmoid) model outputs.
        target: Ground-truth mask with the same shape as ``logits``.
        thr: Probability threshold for a positive prediction.
        eps: Smoothing term added to numerator and denominator.
    """
    hard_pred = torch.sigmoid(logits).gt(thr).float()
    overlap = torch.sum(hard_pred * target)
    total = hard_pred.sum() + target.sum()
    score = (2 * overlap + eps) / (total + eps)
    return score.item()

def logits_to_probs_preds(logits, thr=0.5):
    """Convert logits into sigmoid probabilities and thresholded predictions.

    Returns:
        ``(probs, preds)`` where ``probs = sigmoid(logits)`` and ``preds`` is a
        float tensor holding 1.0 where ``probs > thr`` and 0.0 elsewhere.
    """
    probabilities = torch.sigmoid(logits)
    return probabilities, probabilities.gt(thr).float()

def batch_pixel_accuracy(preds, targets):
    """Fraction of elements where ``preds`` equals ``targets``, as a float.

    The ratio is taken over every element of the batch, not per image.
    """
    matches = torch.eq(preds, targets).float().sum()
    return (matches / targets.numel()).item()

def batch_iou(preds, targets, eps=1e-6):
    """Mean intersection-over-union across the images of a batch.

    IoU is computed per image by reducing over dimensions 1, 2 and 3, then
    averaged over the batch dimension and returned as a Python float.

    Args:
        preds: Binary predictions (4-D tensor).
        targets: Binary ground truth with the same shape as ``preds``.
        eps: Smoothing term; an image with empty prediction and empty target
            therefore scores 1.
    """
    per_image_dims = (1, 2, 3)
    overlap = preds * targets
    intersection = overlap.sum(dim=per_image_dims)
    union = (preds + targets - overlap).sum(dim=per_image_dims)
    return ((intersection + eps) / (union + eps)).mean().item()

def batch_dice(preds, targets, eps=1e-6):
    """Mean Dice coefficient across the images of a batch.

    Dice is computed per image by reducing over dimensions 1, 2 and 3, then
    averaged over the batch dimension and returned as a Python float.

    Args:
        preds: Binary predictions (4-D tensor).
        targets: Binary ground truth with the same shape as ``preds``.
        eps: Smoothing term added to numerator and denominator.
    """
    per_image_dims = (1, 2, 3)
    intersection = torch.sum(preds * targets, dim=per_image_dims)
    total = preds.sum(dim=per_image_dims) + targets.sum(dim=per_image_dims)
    per_image_dice = (2 * intersection + eps) / (total + eps)
    return per_image_dice.mean().item()

def batch_precision_recall_f1(preds, targets, eps=1e-6):
    """Per-image precision, recall and F1, each averaged over the batch.

    True positives, false positives and false negatives are counted per image
    by reducing over dimensions 1, 2 and 3.

    Args:
        preds: Binary predictions (4-D tensor).
        targets: Binary ground truth with the same shape as ``preds``.
        eps: Smoothing term used in every ratio.

    Returns:
        Tuple ``(precision, recall, f1)`` of Python floats.
    """
    per_image_dims = (1, 2, 3)
    true_pos = (preds * targets).sum(dim=per_image_dims)
    false_pos = (preds * (1 - targets)).sum(dim=per_image_dims)
    false_neg = ((1 - preds) * targets).sum(dim=per_image_dims)

    precision = (true_pos + eps) / (true_pos + false_pos + eps)
    recall = (true_pos + eps) / (true_pos + false_neg + eps)
    f1 = (2 * precision * recall + eps) / (precision + recall + eps)

    return tuple(metric.mean().item() for metric in (precision, recall, f1))

Tổng số mẫu: 8345

Số mẫu theo nguồn:
source
person_seg     5678
supervisely    2667
Name: count, dtype: int64


In [None]:
# Per-channel (R, G, B) mean and standard deviation that PreprocessedSegDataset
# uses to normalize image tensors after they have been scaled to [0, 1].
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

def set_seed(seed: int):
    """Seed the process-wide Python, NumPy and torch random generators.

    When CUDA is available, the generators of all CUDA devices are seeded too.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

class PreprocessedSegDataset(Dataset):
    """Image/mask dataset that resizes, optionally augments and normalizes samples.

    Each item is read from the file paths stored in the ``images`` and
    ``masks`` columns of ``df``. The image is returned as a float tensor of
    shape ``(3, size, size)`` normalized with ``IMAGENET_MEAN`` /
    ``IMAGENET_STD``; the mask is returned as a binary float tensor of shape
    ``(1, size, size)``.

    Args:
        df: DataFrame with ``images`` and ``masks`` path columns. Other columns
            (for example ``coef``) are not consulted.
        size: Side length, in pixels, of the square output.
        augment: When true, random flip / rotation / crop / color jitter are
            applied after the resize.
        seed: If given, ``set_seed(seed)`` is called at construction time. This
            reseeds the process-wide ``random``, NumPy and torch generators,
            not a generator private to this dataset.
    """

    def __init__(self, df, size=512, augment=False, seed: int = None):
        self.df = df.reset_index(drop=True)
        self.size = size
        self.augment = augment
        if seed is not None:
            set_seed(seed)
        # Photometric augmentation; only used when ``augment`` is true.
        self.color_jitter = T.ColorJitter(brightness=0.15, contrast=0.15, saturation=0.1, hue=0.02)
        # The normalization constants are the same for every sample, so build
        # the broadcastable (3, 1, 1) tensors once instead of per item.
        self._mean = torch.tensor(IMAGENET_MEAN, dtype=torch.float32).view(3, 1, 1)
        self._std = torch.tensor(IMAGENET_STD, dtype=torch.float32).view(3, 1, 1)

    def __len__(self):
        """Number of rows (samples) in the underlying frame."""
        return len(self.df)

    def _resize(self, img, mask):
        """Resize both PIL images to ``size x size``.

        The image is interpolated bilinearly; the mask uses nearest-neighbour
        so that no intermediate label values are created.
        """
        img = img.resize((self.size, self.size), Image.BILINEAR)
        mask = mask.resize((self.size, self.size), Image.NEAREST)
        return img, mask

    def _augment(self, img, mask):
        """Apply the random augmentations to an image/mask pair.

        Geometric transforms (flip, rotation, crop) use the same parameters for
        image and mask; color jitter is applied to the image only. Randomness
        for the geometric part comes from the global ``random`` module.
        """
        # Horizontal flip (p = 0.5).
        if random.random() < 0.5:
            img = ImageOps.mirror(img)
            mask = ImageOps.mirror(mask)

        # Small rotation (p = 0.3, +/- 12 degrees). InterpolationMode is used
        # instead of the 'resample' keyword, which raises an error.
        if random.random() < 0.3:
            angle = random.uniform(-12, 12)
            img = TF.rotate(img, angle, interpolation=InterpolationMode.BILINEAR, expand=False)
            mask = TF.rotate(mask, angle, interpolation=InterpolationMode.NEAREST, expand=False)

        # Mild random crop (p = 0.25, 88-100 % of each side), resized back to
        # the target size.
        if random.random() < 0.25:
            w, h = img.size
            scale = random.uniform(0.88, 1.0)
            new_w, new_h = int(w * scale), int(h * scale)
            left = random.randint(0, max(0, w - new_w))
            top = random.randint(0, max(0, h - new_h))
            box = (left, top, left + new_w, top + new_h)
            img = img.crop(box).resize((self.size, self.size), Image.BILINEAR)
            mask = mask.crop(box).resize((self.size, self.size), Image.NEAREST)

        # Color jitter is always applied, and only to the image.
        img = self.color_jitter(img)

        return img, mask

    def __getitem__(self, idx):
        """Load, preprocess and return the ``(image, mask)`` tensors for row ``idx``."""
        row = self.df.loc[idx]

        # Open through context managers so the file handles are released as
        # soon as the pixel data has been converted into a new image.
        with Image.open(row["images"]) as src:
            img = src.convert("RGB")
        with Image.open(row["masks"]) as src:
            mask = src.convert("L")

        # Deterministic resize first, then the optional random augmentation.
        img, mask = self._resize(img, mask)
        if self.augment:
            img, mask = self._augment(img, mask)

        img_np = np.array(img, dtype=np.float32)
        mask_np = np.array(mask, dtype=np.float32)

        # Bring the mask into [0, 1] using its own maximum, so masks stored as
        # 0/1 and as 0/255 both work, then binarize at 0.5. Masks whose maximum
        # is at most 1 are left unscaled.
        peak = mask_np.max()
        if peak > 1.0:
            mask_np = mask_np / peak
        mask_bin = (mask_np >= 0.5).astype(np.float32)

        # Scale the image to [0, 1] and move channels first: (H, W, 3) -> (3, H, W).
        img_t = torch.from_numpy(img_np / 255.0).permute(2, 0, 1).float()
        # Add a channel dimension to the mask: (H, W) -> (1, H, W).
        mask_t = torch.from_numpy(mask_bin).unsqueeze(0).float()

        # Per-channel normalization with the constants captured in __init__.
        img_t = (img_t - self._mean) / self._std

        return img_t, mask_t


class ASPP(nn.Module):
    """Atrous spatial pyramid pooling head.

    Runs a 1x1 convolution, three 3x3 dilated convolutions (rates 6, 12 and 18)
    and a globally pooled branch over the same input, concatenates the five
    results along the channel axis and fuses them with a 1x1 convolution.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count of every branch and of the fused output.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # Padding equals the dilation rate so the 3x3 branches keep the
        # spatial size of the input.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1)
        self.conv6 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=6, dilation=6)
        self.conv12 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=12, dilation=12)
        self.conv18 = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=18, dilation=18)

        # Image-level branch: pool to 1x1, project, activate.
        self.global_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, out_channels, kernel_size=1),
            nn.ReLU(inplace=True),
        )

        # Fuses the five concatenated branches back to ``out_channels``.
        self.out_conv = nn.Conv2d(5 * out_channels, out_channels, kernel_size=1)

    def forward(self, x):
        """Return the fused multi-rate features at the spatial size of ``x``."""
        height, width = x.size(2), x.size(3)
        # The pooled branch is 1x1 spatially; stretch it back to the input size.
        pooled = F.interpolate(
            self.global_pool(x), size=(height, width), mode="bilinear", align_corners=False
        )
        branches = [self.conv1(x), self.conv6(x), self.conv12(x), self.conv18(x), pooled]
        return self.out_conv(torch.cat(branches, dim=1))

class DeepLabV3_MobileNetV3Large(nn.Module):
    """DeepLabV3-style segmentation network on a MobileNetV3-Large backbone.

    The torchvision MobileNetV3-Large feature extractor (loaded with its
    default pretrained weights) feeds an ``ASPP`` head, followed by a small
    convolutional decoder that produces ``num_classes`` logit channels.

    Args:
        num_classes: Number of output channels (1 for binary segmentation).
    """

    def __init__(self, num_classes=1):
        super().__init__()
        base = mobilenet_v3_large(weights=MobileNet_V3_Large_Weights.DEFAULT)
        # Keep only the convolutional feature extractor; the classifier head
        # of the pretrained network is not used.
        self.backbone = base.features
        # Channel count of the feature map that is fed into the ASPP head.
        backbone_out = 960
        self.aspp = ASPP(backbone_out, 256)
        self.decoder = nn.Sequential(
            nn.Conv2d(256, 128, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, num_classes, 1)
        )

    def forward(self, x):
        """Return logits with the same height and width as the input ``x``.

        The logits are upsampled to the spatial size of the incoming batch
        rather than to a fixed 512x512, so the output lines up with the masks
        for any input resolution (512x512 inputs behave exactly as before).
        """
        out_size = x.shape[-2:]
        x = self.backbone(x)
        x = self.aspp(x)
        x = self.decoder(x)
        return F.interpolate(x, size=out_size, mode="bilinear", align_corners=False)

# Build the train/validation datasets at 512x512; only the training set is
# augmented. Passing seed=42 makes each constructor call set_seed(42), so the
# global RNGs are reseeded here, before the model below is created.
train_ds = PreprocessedSegDataset(X_train_raw, size=512, augment=True, seed=42)
val_ds   = PreprocessedSegDataset(X_test_raw,  size=512, augment=False, seed=42)
# File name under which the per-epoch history is written after training.
history_name = "training_history_preprocessed_metrics.csv"




# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeepLabV3_MobileNetV3Large(num_classes=1).to(device)

# The training objective is BCE-with-logits plus the Dice loss; the Dice term
# is added inside the epoch loop.
criterion_bce = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Only the training loader shuffles; validation keeps the frame order.
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, num_workers=2, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=8, shuffle=False, num_workers=2, pin_memory=True)

num_epochs = 20
# One list per tracked quantity; each receives one value per epoch.
history = {
    "train_loss": [],
    "val_loss": [],
    "val_iou": [],
    "val_dice": [],
    "val_acc": [],
    "val_prec": [],
    "val_recall": [],
    "val_f1": []
}

# Main loop: one training pass followed by one validation pass per epoch.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    train_iou = 0.0
    train_dice = 0.0
    train_acc = 0.0
    train_steps = 0

    for imgs, masks in tqdm(train_loader, desc=f"Train epoch {epoch+1}"):
        imgs = imgs.to(device).float()
        masks = masks.to(device).float()

        # Combined objective: BCE on the logits plus soft Dice.
        logits = model(imgs)
        bce = criterion_bce(logits, masks)
        dsc = dice_loss(logits, masks)
        loss = bce + dsc

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Training metrics reuse the logits of the forward pass above, i.e.
        # they are measured in train mode and before this step's update.
        with torch.no_grad():
            _, preds = logits_to_probs_preds(logits, thr=0.5)
            train_iou += batch_iou(preds, masks)
            train_dice += batch_dice(preds, masks)
            train_acc += batch_pixel_accuracy(preds, masks)
            train_steps += 1

    # Per-batch values are averaged with equal weight per batch; max(1, ...)
    # guards against division by zero when the loader is empty.
    avg_loss = running_loss / max(1, len(train_loader))
    avg_train_iou = train_iou / max(1, train_steps)
    avg_train_dice = train_dice / max(1, train_steps)
    avg_train_acc = train_acc / max(1, train_steps)
    print(f"Epoch {epoch+1} - train avg loss: {avg_loss:.4f} | IoU: {avg_train_iou:.4f} | Dice: {avg_train_dice:.4f} | Acc: {avg_train_acc:.4f}")
    # Only the training loss is stored; the training IoU/Dice/Acc are printed
    # but not kept in the history.
    history["train_loss"].append(avg_loss)

    # ---- validation pass ----
    model.eval()
    val_loss = 0.0
    val_steps = 0

    sum_iou = 0.0
    sum_dice = 0.0
    sum_acc = 0.0
    sum_prec = 0.0
    sum_recall = 0.0
    sum_f1 = 0.0

    with torch.no_grad():
        for imgs, masks in val_loader:
            imgs = imgs.to(device).float()
            masks = masks.to(device).float()

            # Same BCE + Dice objective as in training, evaluated without gradients.
            logits = model(imgs)
            bce = criterion_bce(logits, masks)
            dsc = dice_loss(logits, masks)
            batch_loss = (bce + dsc).item()
            val_loss += batch_loss

            probs, preds = logits_to_probs_preds(logits, thr=0.5)

            # Segmentation metrics on the thresholded predictions of this batch.
            iou_b = batch_iou(preds, masks)
            dice_b = batch_dice(preds, masks)
            acc_b = batch_pixel_accuracy(preds, masks)
            prec_b, recall_b, f1_b = batch_precision_recall_f1(preds, masks)

            sum_iou += iou_b
            sum_dice += dice_b
            sum_acc += acc_b
            sum_prec += prec_b
            sum_recall += recall_b
            sum_f1 += f1_b

            val_steps += 1

    # Average the per-batch validation values (equal weight per batch).
    avg_val_loss = val_loss / max(1, val_steps)
    avg_val_iou = sum_iou / max(1, val_steps)
    avg_val_dice = sum_dice / max(1, val_steps)
    avg_val_acc = sum_acc / max(1, val_steps)
    avg_val_prec = sum_prec / max(1, val_steps)
    avg_val_recall = sum_recall / max(1, val_steps)
    avg_val_f1 = sum_f1 / max(1, val_steps)

    # Record this epoch's validation results.
    history["val_loss"].append(avg_val_loss)
    history["val_iou"].append(avg_val_iou)
    history["val_dice"].append(avg_val_dice)
    history["val_acc"].append(avg_val_acc)
    history["val_prec"].append(avg_val_prec)
    history["val_recall"].append(avg_val_recall)
    history["val_f1"].append(avg_val_f1)

    print(f"Val loss: {avg_val_loss:.4f} | IoU: {avg_val_iou:.4f} | Dice: {avg_val_dice:.4f} | Acc: {avg_val_acc:.4f}")
    print(f"Precision: {avg_val_prec:.4f} | Recall: {avg_val_recall:.4f} | F1: {avg_val_f1:.4f}")

# Write the per-epoch history as CSV under /content/drive, creating the
# target folder first if it does not exist yet.
hist_df = pd.DataFrame(history)
out_path = f"/content/drive/MyDrive/Data Mining/Project/Model/{history_name}"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
hist_df.to_csv(out_path, index=False)
print(f"Saved training history to Drive ({history_name})")
# The checkpoint goes into the same folder as the history file, so the
# makedirs call above already covers it.
save_path = "/content/drive/MyDrive/Data Mining/Project/Model/deeplabv3_mbv3_preprocessing.pth"

# Save the weights after the last epoch together with the optimizer state.
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, save_path)

print(f"Model saved to {save_path}")

Train epoch 1: 100%|██████████| 939/939 [07:50<00:00,  2.00it/s]

Epoch 1 - train avg loss: 0.2739 | IoU: 0.7788 | Dice: 0.8614 | Acc: 0.9443





Val loss: 0.1785 | IoU: 0.8333 | Dice: 0.8997 | Acc: 0.9650
Precision: 0.8967 | Recall: 0.9155 | F1: 0.9020


Train epoch 2: 100%|██████████| 939/939 [07:50<00:00,  2.00it/s]

Epoch 2 - train avg loss: 0.1759 | IoU: 0.8359 | Dice: 0.9017 | Acc: 0.9648





Val loss: 0.1519 | IoU: 0.8488 | Dice: 0.9099 | Acc: 0.9703
Precision: 0.9186 | Recall: 0.9114 | F1: 0.9099


Train epoch 3: 100%|██████████| 939/939 [07:47<00:00,  2.01it/s]

Epoch 3 - train avg loss: 0.1526 | IoU: 0.8500 | Dice: 0.9108 | Acc: 0.9697





Val loss: 0.1440 | IoU: 0.8548 | Dice: 0.9126 | Acc: 0.9722
Precision: 0.9256 | Recall: 0.9121 | F1: 0.9138


Train epoch 4: 100%|██████████| 939/939 [07:52<00:00,  1.99it/s]

Epoch 4 - train avg loss: 0.1367 | IoU: 0.8595 | Dice: 0.9169 | Acc: 0.9729





Val loss: 0.1385 | IoU: 0.8637 | Dice: 0.9188 | Acc: 0.9732
Precision: 0.9064 | Recall: 0.9402 | F1: 0.9224


Train epoch 5: 100%|██████████| 939/939 [07:56<00:00,  1.97it/s]

Epoch 5 - train avg loss: 0.1261 | IoU: 0.8666 | Dice: 0.9214 | Acc: 0.9750





Val loss: 0.1330 | IoU: 0.8673 | Dice: 0.9205 | Acc: 0.9749
Precision: 0.9128 | Recall: 0.9350 | F1: 0.9252


Train epoch 6: 100%|██████████| 939/939 [07:54<00:00,  1.98it/s]

Epoch 6 - train avg loss: 0.1184 | IoU: 0.8724 | Dice: 0.9253 | Acc: 0.9766





Val loss: 0.1423 | IoU: 0.8627 | Dice: 0.9177 | Acc: 0.9733
Precision: 0.9200 | Recall: 0.9233 | F1: 0.9201


Train epoch 7: 100%|██████████| 939/939 [07:57<00:00,  1.97it/s]

Epoch 7 - train avg loss: 0.1123 | IoU: 0.8766 | Dice: 0.9280 | Acc: 0.9778





Val loss: 0.1303 | IoU: 0.8666 | Dice: 0.9196 | Acc: 0.9755
Precision: 0.9380 | Recall: 0.9137 | F1: 0.9208


Train epoch 8: 100%|██████████| 939/939 [07:51<00:00,  1.99it/s]

Epoch 8 - train avg loss: 0.1076 | IoU: 0.8790 | Dice: 0.9294 | Acc: 0.9786





Val loss: 0.1251 | IoU: 0.8709 | Dice: 0.9230 | Acc: 0.9764
Precision: 0.9333 | Recall: 0.9231 | F1: 0.9253


Train epoch 9: 100%|██████████| 939/939 [07:52<00:00,  1.99it/s]

Epoch 9 - train avg loss: 0.1034 | IoU: 0.8824 | Dice: 0.9316 | Acc: 0.9795





Val loss: 0.1225 | IoU: 0.8741 | Dice: 0.9250 | Acc: 0.9771
Precision: 0.9340 | Recall: 0.9253 | F1: 0.9273


Train epoch 10: 100%|██████████| 939/939 [07:52<00:00,  1.99it/s]

Epoch 10 - train avg loss: 0.1032 | IoU: 0.8837 | Dice: 0.9324 | Acc: 0.9796





Val loss: 0.1286 | IoU: 0.8674 | Dice: 0.9196 | Acc: 0.9760
Precision: 0.9375 | Recall: 0.9136 | F1: 0.9219


Train epoch 11: 100%|██████████| 939/939 [07:46<00:00,  2.01it/s]

Epoch 11 - train avg loss: 0.0994 | IoU: 0.8865 | Dice: 0.9344 | Acc: 0.9802





Val loss: 0.1207 | IoU: 0.8769 | Dice: 0.9267 | Acc: 0.9775
Precision: 0.9288 | Recall: 0.9326 | F1: 0.9279


Train epoch 12: 100%|██████████| 939/939 [07:48<00:00,  2.00it/s]

Epoch 12 - train avg loss: 0.0931 | IoU: 0.8899 | Dice: 0.9364 | Acc: 0.9814





Val loss: 0.1182 | IoU: 0.8789 | Dice: 0.9272 | Acc: 0.9780
Precision: 0.9228 | Recall: 0.9412 | F1: 0.9292


Train epoch 13: 100%|██████████| 939/939 [07:48<00:00,  2.00it/s]

Epoch 13 - train avg loss: 0.0945 | IoU: 0.8896 | Dice: 0.9362 | Acc: 0.9812





Val loss: 0.1182 | IoU: 0.8796 | Dice: 0.9285 | Acc: 0.9782
Precision: 0.9290 | Recall: 0.9361 | F1: 0.9320


Train epoch 14: 100%|██████████| 939/939 [07:57<00:00,  1.97it/s]

Epoch 14 - train avg loss: 0.0925 | IoU: 0.8917 | Dice: 0.9377 | Acc: 0.9816





Val loss: 0.1205 | IoU: 0.8775 | Dice: 0.9271 | Acc: 0.9778
Precision: 0.9361 | Recall: 0.9247 | F1: 0.9316


Train epoch 15: 100%|██████████| 939/939 [07:57<00:00,  1.97it/s]

Epoch 15 - train avg loss: 0.0891 | IoU: 0.8937 | Dice: 0.9387 | Acc: 0.9823





Val loss: 0.1156 | IoU: 0.8817 | Dice: 0.9301 | Acc: 0.9789
Precision: 0.9272 | Recall: 0.9391 | F1: 0.9322


Train epoch 16: 100%|██████████| 939/939 [07:56<00:00,  1.97it/s]

Epoch 16 - train avg loss: 0.0868 | IoU: 0.8955 | Dice: 0.9400 | Acc: 0.9827





Val loss: 0.1141 | IoU: 0.8808 | Dice: 0.9285 | Acc: 0.9794
Precision: 0.9375 | Recall: 0.9280 | F1: 0.9320


Train epoch 17: 100%|██████████| 939/939 [07:49<00:00,  2.00it/s]

Epoch 17 - train avg loss: 0.0856 | IoU: 0.8963 | Dice: 0.9404 | Acc: 0.9829





Val loss: 0.1222 | IoU: 0.8804 | Dice: 0.9287 | Acc: 0.9780
Precision: 0.9227 | Recall: 0.9431 | F1: 0.9311


Train epoch 18: 100%|██████████| 939/939 [07:50<00:00,  2.00it/s]

Epoch 18 - train avg loss: 0.0842 | IoU: 0.8967 | Dice: 0.9405 | Acc: 0.9831





Val loss: 0.1140 | IoU: 0.8845 | Dice: 0.9311 | Acc: 0.9798
Precision: 0.9351 | Recall: 0.9355 | F1: 0.9335


Train epoch 19: 100%|██████████| 939/939 [07:45<00:00,  2.02it/s]

Epoch 19 - train avg loss: 0.0844 | IoU: 0.8980 | Dice: 0.9416 | Acc: 0.9832





Val loss: 0.1313 | IoU: 0.8795 | Dice: 0.9285 | Acc: 0.9773
Precision: 0.9211 | Recall: 0.9451 | F1: 0.9297


Train epoch 20: 100%|██████████| 939/939 [07:43<00:00,  2.03it/s]

Epoch 20 - train avg loss: 0.0828 | IoU: 0.8987 | Dice: 0.9419 | Acc: 0.9834





Val loss: 0.1111 | IoU: 0.8861 | Dice: 0.9325 | Acc: 0.9797
Precision: 0.9335 | Recall: 0.9392 | F1: 0.9348
Saved training history to Drive (training_history_preprocessed_metrics.csv)
Model saved to /content/drive/MyDrive/Data Mining/Project/Model/deeplabv3_mbv3_preprocessing.pth
