In [1]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
import os
from PIL import Image
from tqdm import tqdm

2024-02-24 14:03:18.170671: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-24 14:03:18.170753: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-24 14:03:18.324338: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def load_images(split_dir):
    """Collect the paths of the entries found in each sub-directory of ``split_dir``.

    Parameters
    ----------
    split_dir : str
        Directory to scan. Only its immediate sub-directories are visited;
        any other entry directly inside ``split_dir`` is ignored.

    Returns
    -------
    dict
        Maps each sub-directory name to the list of paths
        (``split_dir/<sub-directory>/<entry>``) of everything it contains.
        No filtering on file type is applied. Keys and paths are in sorted
        order so the result is identical from one run/machine to the next
        (``os.listdir`` alone returns entries in arbitrary order).
    """
    images = {}

    for p in sorted(os.listdir(split_dir)):
        p_dir = os.path.join(split_dir, p)
        if not os.path.isdir(p_dir):
            continue

        images[p] = [
            os.path.join(p_dir, entry) for entry in sorted(os.listdir(p_dir))
        ]
    return images

def assign_labels(images_dict, labels_dict):
    """Flatten a per-key mapping of items into two parallel lists.

    Parameters
    ----------
    images_dict : dict
        Maps a key to a list of items (here, image paths).
    labels_dict : dict
        Maps each key of ``images_dict`` to its label.

    Returns
    -------
    tuple of (list, list)
        All items in the iteration order of ``images_dict``, and a list of
        the same length in which each item's label is the label of the key
        it came from.

    Raises
    ------
    KeyError
        If a key of ``images_dict`` is absent from ``labels_dict``.
    """
    flat_items, flat_labels = [], []
    for key, key_items in images_dict.items():
        key_label = labels_dict[key]
        flat_items += key_items
        # One copy of the key's label per item so both lists stay aligned.
        flat_labels += [key_label] * len(key_items)
    return flat_items, flat_labels

In [3]:
# Root of the input dataset; later cells append sub-paths to it.
data_dir = '/kaggle/input/dlmi-lymphocytosis/dlmi-lymphocytosis-classification/'
# Directory that receives the augmented images written by the augmentation cell.
output_dir = '/kaggle/working/dlmi-lymphocytosis-augmented-data/'

In [4]:
# Build paths with os.path.join so they stay valid whether or not data_dir
# ends with a path separator (plain string concatenation silently produced a
# wrong path otherwise).
train_dir = os.path.join(data_dir, "trainset")
trainset_true_df = pd.read_csv(os.path.join(train_dir, "trainset_true.csv"))
# Lookup table: value of the "ID" column -> value of the "LABEL" column.
labels_dict = dict(zip(trainset_true_df["ID"], trainset_true_df["LABEL"]))

In [5]:
# List the image paths found under each sub-directory of the training set.
images_names_dict = load_images(train_dir)
# Flatten to two aligned sequences (path, label) and turn both into NumPy
# arrays so they can be filtered with boolean masks.
images_names, images_labels = (
    np.array(sequence)
    for sequence in assign_labels(images_names_dict, labels_dict)
)

In [6]:
n_images_per_class = 2_000

# Seeded generator: the same subset is drawn on every Restart & Run All
# (the unseeded global np.random state gave a different subset each run).
sampling_rng = np.random.default_rng(42)

# Split the paths by class using the aligned label array.
images_names_0 = images_names[images_labels == 0]
images_names_1 = images_names[images_labels == 1]

# replace=False draws each path at most once; NumPy raises ValueError if a
# class holds fewer than n_images_per_class paths.
selected_images_names_0 = sampling_rng.choice(images_names_0, size=n_images_per_class, replace=False)
selected_images_names_1 = sampling_rng.choice(images_names_1, size=n_images_per_class, replace=False)

In [9]:
images = []
labels = []

# The loop variable is deliberately NOT called `images_names`: reusing that
# name overwrote the global array of all paths, so re-running the sampling
# cell afterwards operated on the wrong data.
for class_images_names, label in zip([selected_images_names_0, selected_images_names_1], [0, 1]):
    for img_path in tqdm(class_images_names, desc=f'Chargement des images [{label}]'):
        # Context manager closes the underlying file as soon as the pixels
        # have been copied into the array, instead of leaving thousands of
        # handles open until garbage collection.
        with Image.open(img_path) as img:
            img_array = np.array(img)
        images.append(img_array)
        labels.append(label)

# NOTE(review): stacking assumes every image has the same shape and mode — confirm for this dataset.
images = np.array(images)
labels = np.array(labels)

Chargement des images [0]: 100%|██████████| 2000/2000 [00:14<00:00, 133.63it/s]
Chargement des images [1]: 100%|██████████| 2000/2000 [00:13<00:00, 152.17it/s]


In [10]:
# One-hot encode the integer labels. The ndim guard makes the cell idempotent:
# re-running it must not one-hot encode labels that are already one-hot.
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=2)

# Number of augmented variants generated (and saved) per source image.
n_augmentations_per_image = 5

datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

os.makedirs(output_dir, exist_ok=True)

# NOTE: every augmented array is kept in memory in addition to being written
# to disk by the generator.
augmented_images = []
augmented_labels = []

for i in tqdm(range(images.shape[0]), desc='Augmentation des images'):
    img = images[i]
    label = labels[i]
    class_index = int(np.argmax(label))  # integer class recovered from the one-hot row

    img = img.reshape((1,) + img.shape)  # add the leading batch axis expected by the generator

    for j in range(n_augmentations_per_image):
        # Keras names each saved file from the prefix, the sample index inside
        # the array given to flow() (always 0 for a one-image array) and a
        # small random number. With one shared prefix for all images, files
        # collide and overwrite each other, and the class is not recorded.
        # A prefix unique per (class, source image, variant) avoids both.
        flow = datagen.flow(
            img,
            batch_size=1,
            save_to_dir=output_dir,
            save_prefix=f'aug_c{class_index}_i{i:05d}_a{j}',
            save_format='jpg',
        )
        batch = next(flow)  # generates one augmented sample and writes it to output_dir
        augmented_images.append(batch[0])
        augmented_labels.append(label)


Augmentation des images: 100%|██████████| 4000/4000 [03:20<00:00, 19.99it/s]


In [1]:
import zipfile
import os

# Directory whose content is archived, and the archive to create.
dossier_a_compresser = '/kaggle/working/dlmi-lymphocytosis-augmented-data/'

fichier_zip = '/kaggle/working/augmented-data.zip'

try:
    # ZIP_DEFLATED compresses entries; the default (ZIP_STORED) only copies
    # them, so the archive was at least as large as the source directory.
    with zipfile.ZipFile(fichier_zip, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for dossier_racine, _, fichiers in os.walk(dossier_a_compresser):
            for fichier in fichiers:
                chemin_complet = os.path.join(dossier_racine, fichier)
                # Store paths relative to the archived directory, not absolute ones.
                arcname = os.path.relpath(chemin_complet, dossier_a_compresser)
                zipf.write(chemin_complet, arcname=arcname)
except OSError:
    # E.g. "No space left on device": do not leave a truncated, unusable
    # archive behind occupying the disk; remove it, then surface the error.
    if os.path.exists(fichier_zip):
        os.remove(fichier_zip)
    raise

OSError: [Errno 28] No space left on device

In [None]:
from IPython.display import FileLink

# Render a clickable link to the archive; the path is relative to the
# notebook's current working directory.
archive_link = FileLink(r'./augmented-data.zip')
archive_link