# Jupyter de test à supprimer avant le rendu

In [8]:
import random
from PIL import Image
from tqdm import tqdm
from datasets import load_from_disk
from collections import defaultdict
import os
import json
import cv2
import numpy as np
from build_dataset import DATASET_PATH, PROJECT_LABELS

In [23]:
def create_dataset_folders():
    base_path = os.getcwd()  # Get current working directory
    dataset_path = os.path.join(base_path, 'dataset')
    subfolders = ['images_dataset/train/images',
                  'images_dataset/train/labels',
                  'images_dataset/test/images',
                  'images_dataset/test/labels']

    for subfolder in subfolders:
        path = os.path.join(dataset_path, subfolder)
        if not os.path.exists(path):
            os.makedirs(path)
            print(f"Created directory: {path}")
        else:
            print(f"Directory already exists: {path}")


def load_dataset(path):
    try:
        dataset = load_from_disk(path)
        return dataset
    except Exception as e:
        raise FileNotFoundError(f"Dataset not found at {path}. Error: {e}")


def create_image_sublists(dataset, max_images_per_label=3000, max_images_per_group=8, min_images_per_group=4):
    label_images = defaultdict(list)
    labels_keep_index = [PROJECT_LABELS.index(label) for label in PROJECT_LABELS]
    print(labels_keep_index)
    print(len(dataset))
    for item in tqdm(dataset, desc='Create dict for each label : '):
        if item['label'] in labels_keep_index:
            if len(label_images[item['label']]) != max_images_per_label :
                label_images[item['label']].append(item)
    

    # Flatten the list and shuffle
    all_images = [image for images in label_images.values() for image in images]
    random.shuffle(all_images)

    image_groups = []
    pbar = tqdm()
    while len(all_images) >= min_images_per_group:

        group_size = random.randint(min_images_per_group, min(max_images_per_group, len(all_images)))
        group = []

        for _ in range(group_size):
            for i, image in enumerate(all_images):
                group.append(image)
                del all_images[i]
                break

        image_groups.append(group)
        pbar.update(1) 

    return image_groups


def check_overlap(new_box, existing_boxes):
    for box in existing_boxes:
        if not (new_box[2] < box[0] or new_box[0] > box[2] or new_box[3] < box[1] or new_box[1] > box[3]):
            return True
    return False


def create_black_canvas(width, height):
    return Image.new("RGB", (width, height), "black")

def place_images(base_image, images_to_place, image_size_min=72, image_size_max = 92):
    base_w, base_h = base_image.size
    annotations = []
    placed_boxes = []

    for img_data in images_to_place:
        
        img = img_data['image']
        img_label = img_data['label']
        
        image_size = random.randint(image_size_min, image_size_max)

        img_np = np.array(img)
        resized_img_np = cv2.resize(img_np, (image_size, image_size), interpolation=cv2.INTER_CUBIC)
        img = Image.fromarray(resized_img_np)

        while True:
            x = random.randint(0, base_w - image_size)
            y = random.randint(0, base_h - image_size)
            new_box = [x, y, x + image_size, y + image_size]
            
            if not check_overlap(new_box, placed_boxes):
                break

        base_image.paste(img, (x, y))
        placed_boxes.append(new_box)

        x_center, y_center = (x + image_size / 2) / base_w, (y + image_size / 2) / base_h
        norm_width, norm_height = image_size / base_w, image_size / base_h

        annotations.append(f"{img_label} {x_center} {y_center} {norm_width} {norm_height}")

    return base_image, annotations

In [3]:
create_dataset_folders()

Created directory: c:\Users\lorel\Documents\ESME\pencity\ml\dataset\images_dataset/train/images
Created directory: c:\Users\lorel\Documents\ESME\pencity\ml\dataset\images_dataset/train/labels
Created directory: c:\Users\lorel\Documents\ESME\pencity\ml\dataset\images_dataset/test/images
Created directory: c:\Users\lorel\Documents\ESME\pencity\ml\dataset\images_dataset/test/labels


In [4]:
try:
    dataset = load_dataset(DATASET_PATH)
except FileNotFoundError as error:
    print(error)

In [5]:
base_width, base_height = 416, 416

In [6]:
all_images = dataset[('train')]

In [15]:
image_groups = create_image_sublists(all_images, max_images_per_label=3000, max_images_per_group=8, min_images_per_group=4)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
2292196


Create dict for each label : 100%|██████████| 2292196/2292196 [03:26<00:00, 11115.47it/s]
8485it [00:00, 44337.80it/s]


In [16]:
print(len(image_groups))

8485


In [22]:
for images in image_groups :
    print(len(images))
    
    print(images[0]['label'])

5
2
8
3
8
8
7
7
6
15
4
11
6
0
7
11
4
1
7
16
8
2
7
1
5
16
8
6
6
12
8
0
5
3
5
7
8
3
8
3
4
9
5
6
7
6
5
1
7
15
7
13
7
15
7
7
7
6
5
3
6
15
5
15
7
4
4
0
8
3
5
16
5
12
5
2
8
14
7
4
6
5
5
10
7
15
8
5
5
12
7
2
5
2
6
11
8
13
7
0
6
16
5
2
5
9
8
16
7
5
4
9
5
2
4
10
7
5
6
15
7
4
6
11
5
11
8
7
4
9
4
5
5
10
5
4
6
9
8
6
7
11
7
11
5
12
5
3
8
15
8
9
5
5
4
5
7
6
8
14
7
12
5
16
7
6
7
4
8
3
6
8
7
15
4
14
8
12
8
10
6
13
5
7
5
11
8
7
6
10
6
3
5
2
6
12
8
5
4
15
7
16
7
1
7
15
6
8
4
16
5
3
4
8
5
13
8
15
7
13
5
11
8
4
5
11
7
11
8
12
8
13
4
15
8
2
8
2
7
15
5
9
5
0
7
12
7
7
4
12
8
3
6
7
5
10
4
0
4
16
6
16
5
11
6
8
6
12
7
15
5
12
4
1
6
15
4
1
8
10
8
12
8
4
7
13
5
16
8
2
8
6
5
9
6
12
6
11
6
11
7
5
7
1
4
8
8
1
7
0
6
15
5
16
5
4
6
6
8
7
6
16
8
15
7
12
4
4
6
3
5
1
6
7
7
4
5
12
5
4
5
11
8
5
5
15
4
5
8
14
6
15
8
3
7
5
5
2
7
0
7
4
6
13
8
9
4
9
6
12
7
0
8
11
8
10
8
7
8
2
5
1
5
12
7
1
8
2
6
6
5
14
8
1
8
2
8
16
8
1
4
1
5
7
5
0
6
9
5
12
4
15
5
14
6
0
8
0
8
3
6
0
8
11
6
16
4
4
5
6
8
3
4
2
4
4
8
6
4
13
7
10
6
3
5
0
4
16
5
15
6


In [24]:
for index, image_group in enumerate(tqdm(image_groups)):
            base_img = create_black_canvas(base_width, base_height)
            result_img, annotations = place_images(base_img, image_group, image_size=82)

100%|██████████| 8485/8485 [00:04<00:00, 2062.60it/s]
