# Preprocessing des images

## Objectif

Ce notebook contient le code de prétraitement (preprocessing) des images : rognage des marges blanches, redimensionnement et, en option, conversion en niveaux de gris.

## Fonctionnement

1. Renseigner les constantes dans la première cellule.
2. Exécuter toutes les cellules.

Note : seules les images qui ne sont pas déjà présentes dans `OUTPUT_IMG_DIR` sont générées. Ainsi, le traitement peut être interrompu à tout moment : vous ne perdrez pas l'avancement, il suffit de relancer les cellules pour le reprendre.

In [27]:
# Environment-related constants
# DATA_DIR is handed to data.load_data() to load the dataset.
DATA_DIR = "data"
# Directory prefix the source images are read from.
INPUT_IMG_DIR = "data/images/image_train/"
# Directory prefix the processed images are written to; it is created when
# missing, and images already present in it are not processed again.
OUTPUT_IMG_DIR = "data/images/cropped_50_grayscaled/"

# Image-related constants
# Target size passed to Image.resize() for every output image.
IMG_SIZE = (50, 50)
# When True, images are padded with white towards a square before resizing.
KEEP_RATIO = True
# Number of worker threads processing the image queue.
NB_THREADS = 4
# When True, images are converted to grayscale before being saved.
GRAYSCALE = True

In [30]:
import os
from threading import Thread
import time
import datetime
from PIL import Image, ImageOps
import numpy as np
from IPython.display import clear_output
from queue import Queue
import queue
from src.data import data, analysis
from src.data.analysis import get_img_name


# Load the dataset from DATA_DIR; its "productid" and "imageid" columns are
# read further down to build the image file names.
df = data.load_data(DATA_DIR)

def get_imgs_path(productids: list[int], imageids: list[int]):
    """Return the image file name of every (product id, image id) pair.

    Both sequences are walked in lockstep and each pair is handed to
    ``get_img_name``.

    Args:
        productids: product identifiers.
        imageids: image identifiers, aligned with ``productids``.

    Returns:
        A list with one ``get_img_name`` result per pair, in input order.

    Raises:
        ValueError: if the two sequences do not have the same length.
    """
    same_length = len(productids) == len(imageids)
    if not same_length:
        raise ValueError("productids and imageids should be the same size")

    names = []
    for pid, iid in zip(productids, imageids):
        names.append(get_img_name(pid, iid))
    return names

# File name of every image referenced by the dataset.
filenames = get_imgs_path(df.loc[:, "productid"], df.loc[:, "imageid"])

# exist_ok=True creates the output directory only when it is missing,
# without a separate (race-prone) existence check.
os.makedirs(OUTPUT_IMG_DIR, exist_ok=True)

# Images that already have an output file are skipped, which is what lets an
# interrupted run resume where it stopped. A set gives O(1) membership tests.
existing_files = set(os.listdir(OUTPUT_IMG_DIR))
files_to_process = [name for name in filenames if name not in existing_files]

class Progression():
    """Print the completion percentage and an estimated remaining time.

    The estimate assumes every row takes the average time observed since the
    instance was created.
    """

    def __init__(self, total_rows:int):
        """Remember the total number of rows and start the timer."""
        self.total_rows = total_rows
        self.start_time = time.perf_counter()

    def display(self, remaining_rows_number: int):
        """Refresh the progress output.

        Nothing is printed while no row has been completed yet, because no
        per-row duration can be derived at that point.

        Args:
            remaining_rows_number: number of rows still waiting to be processed.
        """
        done_rows = self.total_rows - remaining_rows_number
        if done_rows != 0:
            elapsed = time.perf_counter() - self.start_time
            seconds_per_row = elapsed / done_rows
            eta_seconds = (self.total_rows - done_rows) * seconds_per_row
            percent = np.round(done_rows/self.total_rows*100, 2)

            # Replace the previous output instead of appending to it.
            clear_output(wait=True)
            print("Avancement : ", percent, "%")
            print("Temps restant :", datetime.timedelta(seconds=int(eta_seconds)))

    def done(self):
        """Replace the progress output with the final 100% message."""
        clear_output(wait=True)
        print("Avancement : 100%")

def _find_content_bounds(img_array):
    """Locate the bounding box of the non-white pixels of an image array.

    A pixel counts as white only when all of its values equal 255.

    Args:
        img_array: 2-D (rows, columns) or 3-D (rows, columns, channels) array.

    Returns:
        ``(top, bottom, left, right)`` such that
        ``img_array[top:bottom, left:right]`` is the smallest slice containing
        every non-white pixel, or ``None`` when there is no non-white pixel.
    """
    non_white = img_array != 255
    if non_white.ndim == 3:
        # A pixel is non-white as soon as one of its channels differs from 255.
        non_white = non_white.any(axis=2)

    content_rows = np.flatnonzero(non_white.any(axis=1))
    if content_rows.size == 0:
        return None
    content_cols = np.flatnonzero(non_white.any(axis=0))

    # End indices are exclusive, hence the +1 on the last content row/column.
    return (int(content_rows[0]), int(content_rows[-1]) + 1,
            int(content_cols[0]), int(content_cols[-1]) + 1)


def crop_resize(filename:str, imput_img_dir:str, output_img_dir:str, image_size, keep_ratio:bool, grayscale:bool=False) -> None:
    """Crop the white margins of an image, resize it and save the result.

    The image is cropped to the bounding box of its non-white pixels (an
    image without any non-white pixel is left uncropped), optionally padded
    with white to a square, resized, optionally converted to grayscale and
    saved under the same file name in the output directory.

    NOTE(review): the white padding colour is given as an RGB triple, so this
    presumably expects RGB input images - confirm.

    Args:
        filename: name of the image file, relative to both directories.
        imput_img_dir: directory the source image is read from.
        output_img_dir: directory the processed image is written to.
        image_size: target size passed to ``Image.resize``.
        keep_ratio: when True, pad the cropped image with white to a square
            before resizing so the content is not stretched.
        grayscale: when True, convert the result to grayscale before saving.
    """
    # Materialise the pixels while the file is open, then release the handle
    # immediately instead of leaving it to the garbage collector.
    with Image.open(os.path.join(imput_img_dir, filename)) as source_img:
        img_array = np.asarray(source_img)

    bounds = _find_content_bounds(img_array)
    if bounds is not None:
        top, bottom, left, right = bounds
        # Inclusive of the first and last content rows/columns; rank-agnostic
        # slicing keeps any trailing channel axis untouched.
        img_array = img_array[top:bottom, left:right]
    new_img = Image.fromarray(img_array)

    if keep_ratio:
        size_gap = new_img.width - new_img.height
        # Split the gap over both sides; the second side takes the extra
        # pixel when the gap is odd so the padded image is exactly square.
        pad_before = abs(size_gap) // 2
        pad_after = abs(size_gap) - pad_before
        if size_gap > 0:
            # Wider than tall: pad top and bottom (left, top, right, bottom).
            padding = (0, pad_before, 0, pad_after)
        else:
            # Taller than wide (or already square): pad left and right.
            padding = (pad_before, 0, pad_after, 0)

        new_img = ImageOps.expand(new_img, padding, (255, 255, 255))

    new_img = new_img.resize(image_size)

    if grayscale:
        new_img = ImageOps.grayscale(new_img)

    new_img.save(os.path.join(output_img_dir, filename))

def initiate_crop_resize(queue:Queue, imput_img_dir:str, output_img_dir:str, image_size, keep_ratio:bool, grayscale:bool):
    """Worker loop: process file names from the queue until it is empty.

    Each file name taken from the queue is handed to ``crop_resize`` with the
    given settings. The function returns once the queue has no item left.

    Args:
        queue: queue of image file names shared between the worker threads.
        imput_img_dir: directory the source images are read from.
        output_img_dir: directory the processed images are written to.
        image_size: target size forwarded to ``crop_resize``.
        keep_ratio: forwarded to ``crop_resize``.
        grayscale: forwarded to ``crop_resize``.
    """
    # The `queue` parameter shadows the `queue` module inside this function,
    # so the exception class is imported locally under its own name.
    from queue import Empty

    while True:
        # A non-blocking get avoids the empty()/get() race: another worker
        # may take the last item between the two calls, and a blocking get()
        # would then wait forever and hang the final join().
        try:
            filename = queue.get_nowait()
        except Empty:
            return
        crop_resize(filename, imput_img_dir, output_img_dir, image_size, keep_ratio, grayscale)
    
# Fill the work queue through its public API rather than by overwriting the
# queue's internal storage attribute.
filenames_queue = Queue()
for filename in files_to_process:
    filenames_queue.put(filename)

nb_total_imgs = filenames_queue.qsize()

# One worker per thread; they all consume the same queue.
threads = [
    Thread(target=initiate_crop_resize, args=(filenames_queue, INPUT_IMG_DIR, OUTPUT_IMG_DIR, IMG_SIZE, KEEP_RATIO, GRAYSCALE))
    for _ in range(NB_THREADS)
]

# Start the clock before the workers so the time estimate covers all the
# processing done so far.
progress = Progression(nb_total_imgs)

for thread in threads:
    thread.start()

# Report progress while work is pending; stop early if every worker has
# exited (e.g. after errors) so the loop cannot spin forever.
while not filenames_queue.empty() and any(thread.is_alive() for thread in threads):
    progress.display(filenames_queue.qsize())
    time.sleep(3)

# The queue may be empty while the last images are still being processed.
for thread in threads:
    thread.join()

progress.done()

Avancement : 100%
