In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import shutil
from tqdm import tqdm


In [5]:
TACO_DATASET_ROOT_PATH = r"N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9ue27atrFsCtFln\Drive KESK'IA\POC - C Du Propre\dataset\taco-2gb"
NEW_TACO_DATASET_PATH = r"N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9ue27atrFsCtFln\Drive KESK'IA\POC - C Du Propre\dataset\taco-2gb-updated"

# print tree structure
for root, dirs, files in os.walk(TACO_DATASET_ROOT_PATH):
    level = root.replace(TACO_DATASET_ROOT_PATH, '').count(os.sep)
    indent = ' ' * 4 * (level)
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    # nb of files in each folder
    print('{}{} img files'.format(subindent, len(files)))

taco-2gb/
    4 img files
    data/
        2 img files
        batch_1/
            102 img files
        batch_10/
            101 img files
        batch_11/
            101 img files
        batch_12/
            101 img files
        batch_13/
            101 img files
        batch_14/
            101 img files
        batch_15/
            86 img files
        batch_2/
            93 img files
        batch_3/
            98 img files
        batch_4/
            90 img files
        batch_5/
            113 img files
        batch_6/
            98 img files
        batch_7/
            128 img files
        batch_8/
            101 img files
        batch_9/
            101 img files


## TACO dataset

In [6]:
taco_meta_df = pd.read_csv(os.path.join(TACO_DATASET_ROOT_PATH, "meta_df.csv"))
taco_meta_df

Unnamed: 0,img_id,img_width,img_height,img_file,cat_id,cat_name,supercategory,ann_id,x,y,width,height,area
0,0,1537,2049,batch_1/000006.jpg,6,Glass bottle,Bottle,1,517.0000,127.0000,447.0000,1322.0,4.039540e+05
1,1,1537,2049,batch_1/000008.jpg,18,Meal carton,Carton,2,1.0000,457.0000,1429.0000,1519.0,1.071260e+06
2,1,1537,2049,batch_1/000008.jpg,14,Other carton,Carton,3,531.0000,292.0000,1006.0000,672.0,9.958350e+04
3,2,1537,2049,batch_1/000010.jpg,5,Clear plastic bottle,Bottle,4,632.0000,987.0000,500.0000,374.0,7.383250e+04
4,2,1537,2049,batch_1/000010.jpg,7,Plastic bottle cap,Bottle cap,5,632.0000,989.0000,44.0000,51.0,9.150000e+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4779,1498,1824,4000,batch_9/000098.jpg,16,Drink carton,Carton,4779,228.7143,1550.0476,1007.9999,579.0,5.032469e+05
4780,1498,1824,4000,batch_9/000098.jpg,7,Plastic bottle cap,Bottle cap,4780,1041.3334,1721.7142,141.0000,138.0,1.449450e+04
4781,1499,1824,4000,batch_9/000099.jpg,39,Other plastic wrapper,Plastic bag & wrapper,4781,862.0274,1331.2500,505.9726,612.5,1.549541e+05
4782,1499,1824,4000,batch_9/000099.jpg,14,Other carton,Carton,4782,966.0000,1996.0000,211.0000,336.0,2.867500e+04


In [7]:
taco_meta_df["path"] = taco_meta_df["img_file"].apply(lambda x: os.path.join(TACO_DATASET_ROOT_PATH, "data", x))

# add new img file path (replace '/' by '_')
taco_meta_df["img_file"] = taco_meta_df["img_file"].apply(lambda x: x.replace("/", "_"))

In [8]:
taco_meta_df.columns

Index(['img_id', 'img_width', 'img_height', 'img_file', 'cat_id', 'cat_name',
       'supercategory', 'ann_id', 'x', 'y', 'width', 'height', 'area', 'path'],
      dtype='object')

## Convert to yolo format

In [9]:
# Calcul et ajout des nouvelles colonnes normalisées
taco_meta_df['x_center_norm'] = (taco_meta_df['x'] + taco_meta_df['width'] / 2) / taco_meta_df['img_width']
taco_meta_df['y_center_norm'] = (taco_meta_df['y'] + taco_meta_df['height'] / 2) / taco_meta_df['img_height']
taco_meta_df['width_norm'] = taco_meta_df['width'] / taco_meta_df['img_width']
taco_meta_df['height_norm'] = taco_meta_df['height'] / taco_meta_df['img_height']

## Split train/validation/test

In [10]:
from sklearn.model_selection import train_test_split


def split_dataset(df, val_size=0.1, test_size=0.2, random_state=123, stratify_by=None):
    """Split dataset into train, validation, and test sets.

    Args:
        df: Dataframe to split.
        val_size: Size of the validation set.
        test_size: Size of the test set.
        random_state: Random state for reproducibility.
        stratify_by: Stratify by column name.

    Returns:
        Tuple of train, validation, and test sets.
    """
    # Ensure stratify_by is a valid column
    if stratify_by not in df.columns:
        stratify_by = None

    # Adjust val_size to reflect the proportion of the remaining dataset after test split
    val_size_adjusted = val_size / (1 - test_size)

    # Splitting the dataset into training + validation and test sets
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[stratify_by] if stratify_by else None
    )

    # Splitting the training + validation set into training and validation sets
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=train_val_df[stratify_by] if stratify_by else None
    )

    return train_df, val_df, test_df


# Example usage
train_df, val_df, test_df = split_dataset(taco_meta_df, stratify_by="supercategory", val_size=0.15, test_size=0.1)
print(f"train_df: {len(train_df)} ({len(train_df) / len(taco_meta_df) * 100:.1f}%)")
print(f"val_df: {len(val_df)} ({len(val_df) / len(taco_meta_df) * 100:.1f}%)")
print(f"test_df: {len(test_df)} ({len(test_df) / len(taco_meta_df) * 100:.1f}%)")

train_df: 3587 (75.0%)
val_df: 718 (15.0%)
test_df: 479 (10.0%)


In [11]:
train_df["split"] = "train"
val_df["split"] = "val"
test_df["split"] = "test"

train_df["path"] = train_df["img_file"].apply(lambda x: os.path.join("train", "images", x))
val_df["path"] = val_df["img_file"].apply(lambda x: os.path.join("val", "images", x))
test_df["path"] = test_df["img_file"].apply(lambda x: os.path.join("test", "images", x))

new_taco_meta_df = pd.concat([train_df, val_df, test_df], ignore_index=True)
new_taco_meta_df

Unnamed: 0,img_id,img_width,img_height,img_file,cat_id,cat_name,supercategory,ann_id,x,y,width,height,area,path,x_center_norm,y_center_norm,width_norm,height_norm,split
0,457,3120,4160,batch_13_000056.jpg,27,Plastic lid,Lid,1620,1521.0,883.0,544.0,188.0,73468.0,train\images\batch_13_000056.jpg,0.574679,0.234856,0.174359,0.045192,train
1,1365,2448,3264,batch_8_000065.jpg,9,Broken glass,Broken glass,4381,1050.0,2530.0,71.0,55.0,2074.5,train\images\batch_8_000065.jpg,0.443423,0.783548,0.029003,0.016850,train
2,766,3264,2448,batch_2_000089.JPG,36,Plastic film,Plastic bag & wrapper,2378,1641.0,859.0,610.0,180.0,66199.0,train\images\batch_2_000089.JPG,0.596201,0.387663,0.186887,0.073529,train
3,1112,3264,2448,batch_6_000066.JPG,59,Cigarette,Cigarette,3590,1074.0,658.0,12.0,16.0,140.0,train\images\batch_6_000066.JPG,0.330882,0.272059,0.003676,0.006536,train
4,275,4032,3024,batch_11_000074.jpg,36,Plastic film,Plastic bag & wrapper,847,3007.0,51.0,122.0,158.0,7292.0,train\images\batch_11_000074.jpg,0.760913,0.042989,0.030258,0.052249,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4779,1226,3264,2448,batch_7_000064.JPG,36,Plastic film,Plastic bag & wrapper,3809,793.0,2266.0,108.0,72.0,4724.0,test\images\batch_7_000064.JPG,0.259498,0.940359,0.033088,0.029412,test
4780,1081,3264,2448,batch_6_000100.JPG,5,Clear plastic bottle,Bottle,3244,994.0,579.0,608.0,315.0,98612.5,test\images\batch_6_000100.JPG,0.397672,0.300858,0.186275,0.128676,test
4781,1425,2448,3264,batch_9_000025.jpg,39,Other plastic wrapper,Plastic bag & wrapper,4538,716.0,1443.0,400.0,321.0,41103.0,test\images\batch_9_000025.jpg,0.374183,0.491268,0.163399,0.098346,test
4782,719,2448,3264,batch_2_000039.JPG,58,Unlabeled litter,Unlabeled litter,2279,1238.0,730.0,79.0,78.0,3808.0,test\images\batch_2_000039.JPG,0.521855,0.235600,0.032271,0.023897,test


In [21]:
new_taco_meta_df.columns

Index(['img_id', 'img_width', 'img_height', 'img_file', 'cat_id', 'cat_name',
       'supercategory', 'ann_id', 'x', 'y', 'width', 'height', 'area', 'path',
       'x_center_norm', 'y_center_norm', 'width_norm', 'height_norm', 'split'],
      dtype='object')

## Generate yaml file

In [None]:
def generate_yaml_file(path : dit

## Rearrange directory structure (train/val/test) 
yolo format 

**Objectif** : 
- train
    - images
    - labels
- val
- test

In [8]:
train_df

Unnamed: 0,img_id,img_width,img_height,img_file,cat_id,cat_name,supercategory,ann_id,x,y,width,height,area,path,x_center_norm,y_center_norm,width_norm,height_norm
3819,1224,2448,3264,batch_7_000062.JPG,58,Unlabeled litter,Unlabeled litter,3803,1086.0,760.0,70.0,84.0,2829.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.457925,0.245711,0.028595,0.025735
3418,1112,3264,2448,batch_6_000066.JPG,59,Cigarette,Cigarette,3597,1250.0,800.0,16.0,12.0,84.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.385417,0.329248,0.004902,0.004902
489,137,1824,4000,batch_10_000036.jpg,59,Cigarette,Cigarette,489,1324.0,2390.0,14.0,29.0,140.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.729715,0.601125,0.007675,0.007250
3007,1005,3264,2448,batch_5_000106.JPG,36,Plastic film,Plastic bag & wrapper,2999,1364.0,682.0,1139.0,102.0,72266.5,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.592371,0.299428,0.348958,0.041667
4401,1369,3264,2448,batch_8_000069.jpg,39,Other plastic wrapper,Plastic bag & wrapper,4401,2550.0,1593.0,81.0,39.0,1714.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.793658,0.658701,0.024816,0.015931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3609,1161,2448,3264,batch_6_000014.JPG,59,Cigarette,Cigarette,3631,1680.0,2208.0,42.0,52.0,754.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.694853,0.684436,0.017157,0.015931
1136,346,2976,3968,batch_12_000045.jpg,5,Clear plastic bottle,Bottle,1136,1377.0,1499.0,94.0,132.0,7233.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.478495,0.394405,0.031586,0.033266
3081,1032,2448,3264,batch_5_000015.JPG,12,Drink can,Can,3070,1125.0,1571.0,210.0,105.0,7589.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.502451,0.497396,0.085784,0.032169
2326,727,2448,3264,batch_2_000048.JPG,59,Cigarette,Cigarette,2423,910.0,316.0,44.0,10.0,374.0,N:\.shortcut-targets-by-id\1snQvqDK4hGCn0Aj8Z9...,0.380719,0.098346,0.017974,0.003064


In [None]:
for i, df in enumerate([train_df, val_df, test_df]):
    df_name = ["train", "val", "test"][i]
    for object_group in tqdm(train_df.groupby("path"), desc="Processing {}".format(df_name)):
        # copy image 
        orig_image_path = object_group[0]
        label_matrix = object_group[1][["cat_id", "x_center_norm", "y_center_norm", "width_norm", "height_norm"]].values
        image_name = object_group[1]["img_file"].values[0]

        # copy image
        new_image_path = os.path.join(NEW_TACO_DATASET_PATH, "train", "images", image_name)
        if not os.path.exists(os.path.dirname(new_image_path)):
            os.makedirs(os.path.dirname(new_image_path))
        shutil.copy(orig_image_path, new_image_path)

        # create label file (yolo format) .txt 
        new_label_path = os.path.join(NEW_TACO_DATASET_PATH, "train", "labels",
                                      os.path.splitext(image_name)[0] + ".txt")
        if not os.path.exists(os.path.dirname(new_label_path)):
            os.makedirs(os.path.dirname(new_label_path))
        with open(new_label_path, "w") as f:
            for label in label_matrix:
                f.write("{} {} {} {} {}\n".format(int(label[0]), float(label[1]), float(label[2]), float(label[3]),
                                                  float(label[4])))


Processing train:  93%|█████████▎| 1220/1310 [10:38<00:56,  1.60it/s]  

## Create new meta df 

analyse the new dataset

In [19]:
NEW_TACO_DATASET_PATH

"N:\\.shortcut-targets-by-id\\1snQvqDK4hGCn0Aj8Z9ue27atrFsCtFln\\Drive KESK'IA\\POC - C Du Propre\\dataset\\taco-2gb-updated"

In [None]:
import os
import pandas as pd
from PIL import Image

def analyze_yolo_dataset(base_path):
    columns = ['img_width', 'img_height', 'img_file', 'path', 'cat_id', 'cat_name', 'ann_id', 
               'x', 'y', 'width', 'height', 'area', 'x_center_norm', 'y_center_norm', 
               'width_norm', 'height_norm', 'split']
    data = []

    for split in ['train', 'val', 'test']:
        images_path = os.path.join(base_path, split, 'images')
        labels_path = os.path.join(base_path, split, 'labels')

        for img_file in os.listdir(images_path):
            if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                img_path = os.path.join(images_path, img_file)
                label_path = os.path.join(labels_path, img_file.rsplit('.', 1)[0] + '.txt')

                # Get image dimensions
                with Image.open(img_path) as img:
                    img_width, img_height = img.size

                # Read label file
                if os.path.exists(label_path):
                    with open(label_path, 'r') as file:
                        for line in file:
                            cat_id, x, y, width, height = map(float, line.strip().split())
                            area = width * height
                            x_center_norm = (x + width / 2) / img_width
                            y_center_norm = (y + height / 2) / img_height
                            width_norm = width / img_width
                            height_norm = height / img_height

                            data.append([img_width, img_height, img_file, img_path, cat_id, '', '', 
                                         x, y, width, height, area, x_center_norm, y_center_norm, 
                                         width_norm, height_norm, split])

    df = pd.DataFrame(data, columns=columns)
    return df

# Usage
base_path = 'your_yolo_dataset_directory'  # Adjust this to your YOLO dataset directory
df = analyze_yolo_dataset(base_path)

# Optionally save to CSV
df.to_csv('yolo_dataset_analysis.csv', index=False)


## Save 

### save train/validation/test meta df

### save cat_name dict

In [12]:
cat_id_name_map = new_taco_meta_df[["cat_id", "cat_name"]].drop_duplicates().set_index("cat_id").to_dict()["cat_name"]

# sort keys
cat_id_name_map = {k: v for k, v in sorted(cat_id_name_map.items(), key=lambda item: item[0])}
cat_id_name_map

{0: 'Aluminium foil',
 1: 'Battery',
 2: 'Aluminium blister pack',
 3: 'Carded blister pack',
 4: 'Other plastic bottle',
 5: 'Clear plastic bottle',
 6: 'Glass bottle',
 7: 'Plastic bottle cap',
 8: 'Metal bottle cap',
 9: 'Broken glass',
 10: 'Food Can',
 11: 'Aerosol',
 12: 'Drink can',
 13: 'Toilet tube',
 14: 'Other carton',
 15: 'Egg carton',
 16: 'Drink carton',
 17: 'Corrugated carton',
 18: 'Meal carton',
 19: 'Pizza box',
 20: 'Paper cup',
 21: 'Disposable plastic cup',
 22: 'Foam cup',
 23: 'Glass cup',
 24: 'Other plastic cup',
 25: 'Food waste',
 26: 'Glass jar',
 27: 'Plastic lid',
 28: 'Metal lid',
 29: 'Other plastic',
 30: 'Magazine paper',
 31: 'Tissues',
 32: 'Wrapping paper',
 33: 'Normal paper',
 34: 'Paper bag',
 36: 'Plastic film',
 37: 'Six pack rings',
 38: 'Garbage bag',
 39: 'Other plastic wrapper',
 40: 'Single-use carrier bag',
 41: 'Polypropylene bag',
 42: 'Crisp packet',
 43: 'Spread tub',
 44: 'Tupperware',
 45: 'Disposable food container',
 46: 'Foam f

In [13]:
# reindex cat_id
cat_id_name_map = {k: v for k, v in enumerate(cat_id_name_map.values())}
cat_id_name_map

{0: 'Aluminium foil',
 1: 'Battery',
 2: 'Aluminium blister pack',
 3: 'Carded blister pack',
 4: 'Other plastic bottle',
 5: 'Clear plastic bottle',
 6: 'Glass bottle',
 7: 'Plastic bottle cap',
 8: 'Metal bottle cap',
 9: 'Broken glass',
 10: 'Food Can',
 11: 'Aerosol',
 12: 'Drink can',
 13: 'Toilet tube',
 14: 'Other carton',
 15: 'Egg carton',
 16: 'Drink carton',
 17: 'Corrugated carton',
 18: 'Meal carton',
 19: 'Pizza box',
 20: 'Paper cup',
 21: 'Disposable plastic cup',
 22: 'Foam cup',
 23: 'Glass cup',
 24: 'Other plastic cup',
 25: 'Food waste',
 26: 'Glass jar',
 27: 'Plastic lid',
 28: 'Metal lid',
 29: 'Other plastic',
 30: 'Magazine paper',
 31: 'Tissues',
 32: 'Wrapping paper',
 33: 'Normal paper',
 34: 'Paper bag',
 35: 'Plastic film',
 36: 'Six pack rings',
 37: 'Garbage bag',
 38: 'Other plastic wrapper',
 39: 'Single-use carrier bag',
 40: 'Polypropylene bag',
 41: 'Crisp packet',
 42: 'Spread tub',
 43: 'Tupperware',
 44: 'Disposable food container',
 45: 'Foam f

In [19]:
idx_to_supercategory = {k: v for k, v in enumerate(taco_meta_df["supercategory"].drop_duplicates().to_dict().values())}
idx_to_supercategory

{0: 'Bottle',
 1: 'Carton',
 2: 'Bottle cap',
 3: 'Can',
 4: 'Pop tab',
 5: 'Cup',
 6: 'Plastic bag & wrapper',
 7: 'Styrofoam piece',
 8: 'Other plastic',
 9: 'Plastic container',
 10: 'Paper',
 11: 'Cigarette',
 12: 'Lid',
 13: 'Straw',
 14: 'Paper bag',
 15: 'Broken glass',
 16: 'Plastic utensils',
 17: 'Glass jar',
 18: 'Food waste',
 19: 'Squeezable tube',
 20: 'Shoe',
 21: 'Aluminium foil',
 22: 'Unlabeled litter',
 23: 'Blister pack',
 24: 'Battery',
 25: 'Rope & strings',
 26: 'Scrap metal',
 27: 'Plastic glooves'}

In [3]:
# get from pandas 
from constants import CATIDX_TO_CATNAME
for i in range(60):
    print(i,CATIDX_TO_CATNAME[i])

0 Aluminium foil
1 Battery
2 Aluminium blister pack
3 Carded blister pack
4 Other plastic bottle
5 Clear plastic bottle
6 Glass bottle
7 Plastic bottle cap
8 Metal bottle cap
9 Broken glass
10 Food Can
11 Aerosol
12 Drink can
13 Toilet tube
14 Other carton
15 Egg carton
16 Drink carton
17 Corrugated carton
18 Meal carton
19 Pizza box
20 Paper cup
21 Disposable plastic cup
22 Foam cup
23 Glass cup
24 Other plastic cup
25 Food waste
26 Glass jar
27 Plastic lid
28 Metal lid
29 Other plastic
30 Magazine paper
31 Tissues
32 Wrapping paper
33 Normal paper
34 Paper bag


KeyError: 35