# Opdracht schilderijen

In [None]:
import tensorflow as tf
import os, shutil, pathlib
import random
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import cv2

## Opzetten omgeving

Ik heb gewerkt met mijn lokale GPU. Om dit op te zetten maakte ik gebruik van deze [bron](https://towardsdatascience.com/the-ultimate-tensorflow-gpu-installation-guide-for-2022-and-beyond-27a88f5e6c6e). Ik downloadde de CUDA toolkit en CuDNN, maar maakte geen gebruik van Anaconda. Ik heb dit zowel op mijn laptop als op mijn desktop proberen op te zetten. Op mijn laptop is dit zonder enige problemen gelukt, op mijn desktop is dit uiteindelijk niet gelukt.

Ik maakte gebruik van de ingebouwde NVIDIA Geforce RTX 2060 in mijn laptop.

## Data exploration

Check hoeveel images per schilder.

In [None]:
def check_amount_of_images(dir_path):
    """Count the regular files that sit directly inside ``dir_path``.

    Sub-directories are neither counted nor descended into.

    Parameters:
        dir_path: Path to the directory whose files are counted.
    Returns:
        The number of directory entries that are regular files.
    """
    return sum(
        1
        for entry in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, entry))
    )

# Report how many image files each painter directory contains.
for painter in os.listdir("data"):
    n_images = check_amount_of_images(f"data/{painter}")
    print(f"File count {painter}: {n_images}")

We zien dat de data zeer ongebalanceerd is, we kunnen dit oplossen door het willekeurig kopiëren van samples in de klassen met te weinig samples (oversampled methode).
De dubbels zullen dan worden weggewerkt met data augmentation technieken.

We kunnen ook schilderijen van Picasso achterwege laten tot we eenzelfde aantal schilderijen hebben in iedere klasse (undersampled methode), maar het lijkt me jammer om schilderijen die we hebben toch niet te gebruiken voor het model.
We kunnen experimenteren met beide, maar ik zal beginnen met de oversampled methode.

In [None]:
def balance_dataset(dir_path):
    """Oversample the smaller classes until every class has the same size.

    Every sub-directory of ``dir_path`` is treated as one class. Randomly
    chosen original files of each smaller class are copied inside that class
    directory until it holds as many files as the largest class. The copies
    are exact duplicates; making them differ is left to later data
    augmentation.

    Parameters:
        dir_path: A string containing the path to a directory containing
            subdirectories to different classes.
    Raises:
        ValueError: If a class directory contains no files at all, because
            there is then nothing to copy from.
    """
    # List every class once up front. Non-directory entries in the root
    # (e.g. stray OS metadata files) are not classes and are skipped.
    files_per_class = {}
    for cls in os.listdir(dir_path):
        cls_dir = os.path.join(dir_path, cls)
        if not os.path.isdir(cls_dir):
            continue
        files_per_class[cls] = [
            name
            for name in os.listdir(cls_dir)
            if os.path.isfile(os.path.join(cls_dir, name))
        ]

    if not files_per_class:
        return

    target_size = max(len(names) for names in files_per_class.values())

    for cls, originals in files_per_class.items():
        missing = target_size - len(originals)
        if missing <= 0:
            continue
        if not originals:
            raise ValueError(
                f"Cannot oversample empty class directory: {cls!r}"
            )

        cls_dir = os.path.join(dir_path, cls)
        taken = set(os.listdir(cls_dir))
        counter = 0
        for _ in range(missing):
            source = random.choice(originals)
            # A numeric prefix keeps the copy from overwriting its source.
            # The counter is advanced until the name is unused, so every
            # copy really adds one file and nothing gets overwritten.
            copy_name = f"{counter}.{source}"
            while copy_name in taken:
                counter += 1
                copy_name = f"{counter}.{source}"
            shutil.copy(
                os.path.join(cls_dir, source),
                os.path.join(cls_dir, copy_name),
            )
            taken.add(copy_name)
                

In [None]:
balance_dataset("data")

# Count the files per painter again to verify the result of balancing.
for painter in os.listdir("data"):
    n_images = check_amount_of_images(f"data/{painter}")
    print(f"File count {painter}: {n_images}")

Iedere klasse heeft nu hetzelfde aantal images, de dataset is nu wel gebalanceerd.

### Data cleanup

In [None]:
def rename_files(dir_path):
    """Rename every file in ``dir_path`` to ``0.jpg``, ``1.jpg``, ``2.jpg``, ...

    The consecutive numbering is the format that make_subset relies on to
    select files by index. Files are numbered in the order in which
    ``os.listdir`` returns them; sub-directories are left untouched. Every
    file gets the ``.jpg`` extension regardless of its original name.

    Parameters:
        dir_path: A string containing the path to the directory whose files
            are renamed.
    """
    entries = os.listdir(dir_path)
    file_names = [
        name for name in entries
        if os.path.isfile(os.path.join(dir_path, name))
    ]

    # Renaming straight to "<i>.jpg" could hit a file that already carries
    # that name but has not been processed yet (e.g. when this function is
    # run a second time), overwriting it or raising depending on the OS.
    # So first move every file to a temporary name that cannot clash with
    # any existing entry, and only then assign the final names.
    temp_prefix = "__renaming__"
    while any(name.startswith(temp_prefix) for name in entries):
        temp_prefix += "_"

    temp_names = []
    for index, name in enumerate(file_names):
        temp_name = f"{temp_prefix}{index}"
        os.rename(
            os.path.join(dir_path, name),
            os.path.join(dir_path, temp_name),
        )
        temp_names.append(temp_name)

    for index, temp_name in enumerate(temp_names):
        os.rename(
            os.path.join(dir_path, temp_name),
            os.path.join(dir_path, f"{index}.jpg"),
        )

        
# Apply the data clean-up step to every painter directory.
# (A remove_duplicates call used to run here as well; it is disabled.)
for painter_name in os.listdir("data"):
    painter_dir = f"data/{painter_name}"
    rename_files(painter_dir)

### Opsplitsen van de data in train-, validatie en testset

In [None]:
# Source directory: one sub-directory of numbered images per painter.
original_dir = Path("data")
# Destination root under which the train/validation/test subsets are created.
new_base_dir = Path("dataset")

def make_subset(subset_name, start_index, end_index):
    """Copy a contiguous index range of every class's images into a subset.

    For each class directory in ``original_dir`` the files ``<i>.jpg`` with
    ``start_index <= i < end_index`` are copied to
    ``new_base_dir / subset_name / <class>``. The target directories are
    created when needed, so the function can be re-run on an existing
    ``dataset`` tree.

    Parameters:
        subset_name: Name of the subset directory, e.g. "train".
        start_index: Number of the first file to copy (inclusive).
        end_index: Number one past the last file to copy (exclusive).
    Raises:
        FileNotFoundError: Propagated from ``shutil.copyfile`` when a class
            does not contain one of the expected ``<i>.jpg`` files.
    """
    # The file names are the same for every class, so build them once.
    fnames = [f"{i}.jpg" for i in range(start_index, end_index)]

    for category in os.listdir(original_dir):
        source_dir = original_dir / category
        # Stray files in the root are not classes.
        if not source_dir.is_dir():
            continue
        target_dir = new_base_dir / subset_name / category
        os.makedirs(target_dir, exist_ok=True)
        for fname in fnames:
            shutil.copyfile(src=source_dir / fname, dst=target_dir / fname)

# Split the numbered images of every class: 60% train, 20% validation,
# 20% test. Each entry is (subset name, first index, one-past-last index).
# For a quicker experiment a smaller split can be used instead:
# train 0-400, validation 400-500, test 500-600.
for subset, first, stop in (
    ("train", 0, 918),
    ("validation", 918, 1223),
    ("test", 1223, 1529),
):
    make_subset(subset, start_index=first, end_index=stop)

## Inlezen, preprocessen & labelen van de data

In [None]:
def create_data_with_labels(dataset_dir):
    """Load, preprocess and label all images below ``dataset_dir``.

    The label of an image is the integer key under which
    collect_paths_to_files lists its class directory (0 = Mondriaan,
    1 = Picasso, 2 = Rubens, ...). Files for which ``cv2.imread`` returns
    ``None`` are reported on stdout (e.g.
    ``dataset/train/Picasso/145.jpg --> FAILED``) and skipped.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    Returns:
        A tuple ``(data, labels)`` of NumPy arrays: the preprocessed images
        (as returned by preprocess_image) and their labels, in the same
        order.
    """
    image_paths_per_label = collect_paths_to_files(dataset_dir)

    data = []
    labels = []
    for label, image_paths in image_paths_per_label.items():
        for image_path in image_paths:
            img = cv2.imread(str(image_path))
            if img is None:
                print(f"{str(image_path)} --> FAILED")
                continue

            # OpenCV loads images as BGR; the rest of the pipeline uses RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # Preprocess right away so that only the resized image is kept
            # in memory instead of every full-resolution original.
            data.append(preprocess_image(img.astype(np.float32)))
            labels.append(label)

    return np.array(data), np.array(labels)

def collect_paths_to_files(dataset_dir):
    """Returns a dict with labels for each subdirectory of the given directory
    as keys and lists of the subdirectory's contents as values.

    Labels are consecutive integers assigned to the subdirectories in
    alphabetical order. Entries whose name starts with a dot are ignored,
    both among the subdirectories and among their contents. Plain files
    located directly in ``dataset_dir`` are ignored as well, since they are
    not class directories.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    Returns:
        image_paths_per_label: A dict with labels as keys and lists of file
        paths as values.
    """
    dataset_dir = Path(dataset_dir)
    painter_dirs = [
        name
        for name in sorted(os.listdir(dataset_dir))
        if not name.startswith('.') and (dataset_dir / name).is_dir()
    ]
    image_paths_per_label = {
        label: [
            dataset_dir / painter_dir / file_name
            for file_name in os.listdir(dataset_dir / painter_dir)
            if not file_name.startswith('.')
        ]
        for label, painter_dir in enumerate(painter_dirs)
    }
    return image_paths_per_label

def preprocess_image(image, target_size=(180, 180)):
    """Returns a preprocessed image.

    The image is resized with ``cv2.resize`` and its pixel values are then
    divided by 255.

    Parameters:
        image: A RGB image with pixel values in range [0, 255].
        target_size: The ``(width, height)`` passed to ``cv2.resize``.
            Defaults to ``(180, 180)``.
    Returns
        image: The preprocessed image.
    """
    resized = cv2.resize(image, target_size)
    return resized / 255.


In [None]:
# Load, preprocess and label each of the three subsets.
train_data, train_labels = create_data_with_labels("dataset/train/")
val_data, val_labels = create_data_with_labels("dataset/validation")
test_data, test_labels = create_data_with_labels("dataset/test")

In [None]:
# Display the shape of the training array as a sanity check on the resize.
train_data.shape

Ter controle: we zien dat het resizen naar 180x180 gelukt is.