In [1]:
import os
import torch

# Setting up the environment to use only GPU 2
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Check number of GPUs
print(f"Number of GPUs: {torch.cuda.device_count()}")

# Check GPU name
print(f"GPU name: {torch.cuda.get_device_name()}")

# Check PyTorch version
print(f"PyTorch version: {torch.__version__}")

Using device: cuda
Number of GPUs: 1
GPU name: NVIDIA A100-SXM4-40GB
PyTorch version: 2.5.1+cu121


In [2]:
!nvidia-smi

/bin/bash: /data/students/mary/anaconda3/envs/AI231_ME6/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Fri Dec  6 16:15:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  | 00000000:07:00.0 Off |                    0 |
| N/A   49C    P0             328W / 400W |  34420MiB / 40960MiB |     97%      Default |
|                                         |                      |             Disabled |
+-------------------------

In [10]:
from pathlib import Path

dataset_path = Path("/data/students/mary/mlops-exercises/ME6/dataset__all")  # replace with 'path/to/dataset' for your custom data
labels = sorted(dataset_path.rglob("*labels/*.txt"))  # all data in 'labels'
print(sorted(dataset_path.rglob("*labels")))

[PosixPath('/data/students/mary/mlops-exercises/ME6/dataset__all/train/labels'), PosixPath('/data/students/mary/mlops-exercises/ME6/dataset__all/val/labels')]


In [11]:
import yaml

yaml_file = "dataset.yaml" 
with open(yaml_file, "r", encoding="utf8") as y:
    data = yaml.safe_load(y)

names = data['names']
class_dict = {i: name for i, name in enumerate(names)}
cls_idx = sorted(class_dict.keys())

In [12]:
import pandas as pd

indx = [label.stem for label in labels]  # uses base filename as ID (no extension)
labels_df = pd.DataFrame([], columns=cls_idx, index=indx)

In [13]:
from collections import Counter

for label in labels:
    lbl_counter = Counter()

    with open(label, "r") as lf:
        lines = lf.readlines()

    for line in lines:
        # classes for YOLO label uses integer at first position of each line
        lbl_counter[int(line.split(" ")[0])] += 1

    labels_df.loc[label.stem] = lbl_counter

labels_df = labels_df.fillna(0.0)  # replace `nan` values with `0.0`

  labels_df = labels_df.fillna(0.0)  # replace `nan` values with `0.0`


KFold Splitting

In [14]:
from sklearn.model_selection import KFold

ksplit = 3
kf = KFold(n_splits=ksplit, shuffle=True, random_state=20)  # setting random_state for repeatable results

kfolds = list(kf.split(labels_df))

In [15]:
folds = [f"split_{n}" for n in range(1, ksplit + 1)]
folds_df = pd.DataFrame(index=indx, columns=folds)

for idx, (train, val) in enumerate(kfolds, start=1):
    folds_df[f"split_{idx}"].loc[labels_df.iloc[train].index] = "train"
    folds_df[f"split_{idx}"].loc[labels_df.iloc[val].index] = "val"

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  folds_df[f"split_{idx}"].loc[labels_df.iloc[train].index] = "train"
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to upda

In [16]:
fold_lbl_distrb = pd.DataFrame(index=folds, columns=cls_idx)

for n, (train_indices, val_indices) in enumerate(kfolds, start=1):
    train_totals = labels_df.iloc[train_indices].sum()
    val_totals = labels_df.iloc[val_indices].sum()

    # To avoid division by zero, we add a small value (1E-7) to the denominator
    ratio = val_totals / (train_totals + 1e-7)
    fold_lbl_distrb.loc[f"split_{n}"] = ratio

In [17]:
import datetime

supported_extensions = [".jpg", ".jpeg", ".png"]

# Initialize an empty list to store image file paths
images = []

# Loop through supported extensions and gather image files
for ext in supported_extensions:
    images.extend(sorted((dataset_path / "train/images").rglob(f"*{ext}")))
    images.extend(sorted((dataset_path / "val/images").rglob(f"*{ext}")))
# Create the necessary directories and dataset YAML files (unchanged)
save_path = Path(dataset_path / f"{datetime.date.today().isoformat()}_{ksplit}-Fold_Cross-val")
save_path.mkdir(parents=True, exist_ok=True)
ds_yamls = []

for split in folds_df.columns:
    # Create directories
    split_dir = save_path / split
    split_dir.mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "train" / "labels").mkdir(parents=True, exist_ok=True)
    (split_dir / "val" / "images").mkdir(parents=True, exist_ok=True)
    (split_dir / "val" / "labels").mkdir(parents=True, exist_ok=True)

    # Create dataset YAML files
    dataset_yaml = split_dir / f"{split}_dataset.yaml"
    ds_yamls.append(dataset_yaml)

    with open(dataset_yaml, "w") as ds_y:
        yaml.safe_dump(
            {
                "path": split_dir.as_posix(),
                "train": "train",
                "val": "val",
                "names": names,
            },
            ds_y,
        )

In [18]:
import shutil

for image, label in zip(images, labels):
    image_stem = image.stem
    if image_stem in folds_df.index:
        for split, k_split in folds_df.loc[image_stem].items():
            # Destination directory
            img_to_path = save_path / split / k_split / "images"
            lbl_to_path = save_path / split / k_split / "labels"
            img_to_path.mkdir(parents=True, exist_ok=True)
            lbl_to_path.mkdir(parents=True, exist_ok=True)
            shutil.copy(image, img_to_path / image.name)
            shutil.copy(label, lbl_to_path / f"{image_stem}.txt")
    else:
        print(f"Warning: {image_stem} not found in folds_df index")



In [19]:
folds_df.to_csv(save_path / "kfold_datasplit.csv")
fold_lbl_distrb.to_csv(save_path / "kfold_label_distribution.csv")

Training with Kfold

In [20]:
import torch
import comet_ml

comet_ml.login(project_name="MEX6_runners")

In [30]:
from ultralytics import YOLO

model = YOLO("yolo11s-seg.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11s-seg.pt to 'yolo11s-seg.pt'...


100%|██████████| 19.7M/19.7M [00:00<00:00, 55.9MB/s]


In [21]:
from ultralytics import YOLO

weights_path = "yolo11s-seg.pt"

batch = 32
imgsz = 640
patience = 5
epochs = 150

results = {}

for k in range(ksplit):
    dataset_yaml = ds_yamls[k]
    model = YOLO(weights_path)
    model.train(
        data=dataset_yaml,
        project="MEX6_runners",
        epochs=epochs,
        imgsz=imgsz,
        batch=batch,
        patience=patience,
        name=f"runkfold0{k+1}"
    )
    results[k] = model.metrics  # save output metrics for further analysis

# Save results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv(save_path / "kfold_results.csv")

Ultralytics 8.3.43 🚀 Python-3.11.10 torch-2.5.1+cu121 CUDA:0 (NVIDIA A100-SXM4-40GB, 40339MiB)
[34m[1mengine/trainer: [0mtask=segment, mode=train, model=yolo11s-seg.pt, data=/data/students/mary/mlops-exercises/ME6/dataset__all/2024-12-06_3-Fold_Cross-val/split_1/split_1_dataset.yaml, epochs=150, time=None, patience=5, batch=32, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=MEX6_runners, name=runkfold01, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None,

  from .autonotebook import tqdm as notebook_tqdm
2024-12-06 16:18:52,363	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-12-06 16:18:52,450	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Overriding model.yaml nc=80 with nc=24

                   from  n    params  module                                       arguments                     
  0                  -1  1       928  ultralytics.nn.modules.conv.Conv             [3, 32, 3, 2]                 
  1                  -1  1     18560  ultralytics.nn.modules.conv.Conv             [32, 64, 3, 2]                
  2                  -1  1     26080  ultralytics.nn.modules.block.C3k2            [64, 128, 1, False, 0.25]     
  3                  -1  1    147712  ultralytics.nn.modules.conv.Conv             [128, 128, 3, 2]              
  4                  -1  1    103360  ultralytics.nn.modules.block.C3k2            [128, 256, 1, False, 0.25]    
  5                  -1  1    590336  ultralytics.nn.modules.conv.Conv             [256, 256, 3, 2]              
  6                  -1  1    346112  ultralytics.nn.modules.block.C3k2            [256, 256, 1, True]           
  7                  -1  1   1180672  ultralytic

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/marynathalie/mex6-runners/fadb41dfde9c4fadb712c9463dfeff5d

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/raid/students/mary/mlops-exercises/ME6/model_dev' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


[34m[1mTensorBoard: [0mStart with 'tensorboard --logdir MEX6_runners/runkfold01', view at http://localhost:6006/
Freezing layer 'model.23.dfl.conv.weight'
[34m[1mAMP: [0mrunning Automatic Mixed Precision (AMP) checks...
[34m[1mAMP: [0mchecks passed ✅


[34m[1mtrain: [0mScanning /raid/students/mary/mlops-exercises/ME6/dataset__all/2024-12-06_3-Fold_Cross-val/split_1/train/labels... 7450 images, 0 backgrounds, 0 corrupt: 100%|██████████| 7450/7450 [00:04<00:00, 1743.49it/s]






[34m[1mtrain: [0mNew cache created: /raid/students/mary/mlops-exercises/ME6/dataset__all/2024-12-06_3-Fold_Cross-val/split_1/train/labels.cache


[34m[1mval: [0mScanning /raid/students/mary/mlops-exercises/ME6/dataset__all/2024-12-06_3-Fold_Cross-val/split_1/val/labels... 3725 images, 0 backgrounds, 0 corrupt: 100%|██████████| 3725/3725 [00:02<00:00, 1302.51it/s]






[34m[1mval: [0mNew cache created: /raid/students/mary/mlops-exercises/ME6/dataset__all/2024-12-06_3-Fold_Cross-val/split_1/val/labels.cache
Plotting labels to MEX6_runners/runkfold01/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m SGD(lr=0.01, momentum=0.9) with parameter groups 90 weight(decay=0.0), 101 weight(decay=0.0005), 100 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 8 dataloader workers
Logging results to [1mMEX6_runners/runkfold01[0m
Starting training for 150 epochs...

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


      1/150      10.6G      2.395      4.488      4.455      2.358        124        640: 100%|██████████| 233/233 [01:43<00:00,  2.25it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 59/59 [01:22<00:00,  1.40s/it]


                   all       3725       8476      0.346      0.151      0.114     0.0685      0.332      0.138      0.102      0.057

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


      2/150      10.2G      2.315      4.035      3.383      2.222         99        640: 100%|██████████| 233/233 [03:47<00:00,  1.02it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 59/59 [01:10<00:00,  1.20s/it]


                   all       3725       8476       0.48      0.142      0.124     0.0725      0.438      0.132       0.11     0.0649

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


      3/150      10.2G      2.334       4.13      3.455      2.281        118        640: 100%|██████████| 233/233 [02:14<00:00,  1.73it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95): 100%|██████████| 59/59 [01:43<00:00,  1.75s/it]


                   all       3725       8476      0.435     0.0641     0.0359     0.0172      0.472     0.0526     0.0308     0.0128

      Epoch    GPU_mem   box_loss   seg_loss   cls_loss   dfl_loss  Instances       Size


      4/150      10.3G      2.433      4.231      3.709      2.388        118        640: 100%|██████████| 233/233 [02:32<00:00,  1.53it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95)     Mask(P          R      mAP50  mAP50-95):  86%|████████▋ | 51/59 [01:33<00:14,  1.83s/it]


KeyboardInterrupt: 