# Opdracht schilderijen

In [1]:
import tensorflow as tf
import os, shutil, random
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import cv2

## Data exploration

Check hoeveel images per schilder.

In [2]:
def check_amount_of_images(dir_path):
    count = 0
    for path in os.listdir(dir_path):
        # check if current path is a file
        if os.path.isfile(os.path.join(dir_path, path)):
            count += 1
    
    return count

for painter in os.listdir("data"):
    print(f'File count {painter}: {check_amount_of_images(f"data/{painter}")}')

File count Mondriaan: 330
File count Picasso: 1529
File count Rubens: 682


We zien dat de data zeer ongebalanceerd is, we kunnen dit oplossen door het willekeurig kopiëren van samples in de klassen met te weinig samples (undersampled klassen).
De dubbels zullen dan worden weggewerkt met data augmentation technieken.

In [3]:
def balance_dataset(dir_path):
    """
    Random images kopiëren tot dat alle klassen hetzelfde aantal images hebben en dus gebalanceerd zijn.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
        subdirectories to different classes.
    """

    sizes = [check_amount_of_images(f"{dir_path}/{painter}") for painter in os.listdir(dir_path)]
    target_size = max(sizes)
    biggest_class = os.listdir(dir_path)[sizes.index(target_size)]

    for cls in os.listdir(dir_path):
        if (cls != biggest_class):
            
            while check_amount_of_images(f"{dir_path}/{cls}") < target_size:

                random_file = random.choice(os.listdir(f"{dir_path}/{cls}"))
                shutil.copy(f"{dir_path}/{cls}/{random_file}", f"{dir_path}/{cls}/{random.randint(0,1000)}.{random_file}") # random int voor naam zetten anders zou 
                # de image gewoon overgeschreven worden en zou het aantal niet omhoog gaan
                # indien de random_int al bestaat wordt deze gewoon overgeschreven, dus geen nood aan error catch
                

In [4]:
balance_dataset("data")

for painter in os.listdir("data"):
    print(f'File count {painter}: {check_amount_of_images(f"data/{painter}")}')

File count Mondriaan: 1529
File count Picasso: 1529
File count Rubens: 1529


Iedere klasse heeft nu hetzelfde aantal images, de dataset is nu wel gebalanceerd.

### Data cleanup

In [5]:
def rename_files(dir_path):
    """"
    Hernoemen van de files naar de nummering per schilder

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    """
    i = 0

    for path in os.listdir(dir_path):
        # check if current path is a file
        if os.path.isfile(os.path.join(dir_path, path)):
            os.rename(f"{dir_path}/{path}", f"{dir_path}/{i}.jpg")
            i += 1

        
# data cleanup functies toepassen om iedere directory
for painter in os.listdir("data"):
    # remove_duplicates(f"data/{painter}")
    rename_files(f"data/{painter}")

## Inlezen data

In [6]:
def create_data_with_labels(dataset_dir):
    """
    Labelt de data (0 = Mondriaan, 1 = Picasso, 2 = Rubens, ...)
    Print welke files corrupted zijn. (bv. data\Picasso\\145.jpg --> FAILED)

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    Returns:
        de data met de labels
    """
    image_paths_per_label = collect_paths_to_files(dataset_dir)

    images = []
    labels = []
    for label, image_paths in image_paths_per_label.items():
        for image_path in image_paths:

            # print(str(image_path))

            img = cv2.imread(str(image_path))

            if(img is not None):
                # print(f"{i} {str(image_path)} --> succes")
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                images.append(img)
                
                # print(label)
                labels.append(label)       
                      
            else:
                print(f"{str(image_path)} --> FAILED")
                
    # data = np.array([preprocess_image(image.astype(np.float32))
    #             for image in images])
    data = np.array(images)
    labels = np.array(labels)
        
    return data, labels

def collect_paths_to_files(dataset_dir):
    """Returns a dict with labels for each subdirectory of the given directory
    as keys and lists of the subdirectory's contents as values.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    Returns:
        image_paths_per_label: A dict with labels as keys and lists of file
        paths as values.
    """
    dataset_dir = Path(dataset_dir)
    painter_dirs = [f for f in sorted(os.listdir(dataset_dir)) if not f.startswith('.')]
    image_paths_per_label = {
        label: [
            dataset_dir / painter_dir / '{0}'.format(f)
            for f in os.listdir(dataset_dir / painter_dir) if not f.startswith('.')
        ]
        for label, painter_dir in enumerate(painter_dirs)
    }
    return image_paths_per_label

# def preprocess_image(image):
#     """Returns a preprocessed image.

#     Parameters:
#         image: A RGB image with pixel values in range [0, 255].
#     Returns
#         image: The preprocessed image.
#     """
#     image = image / 255.
#     image = cv2.resize(image, (100, 100))

#     return image


In [7]:
(data, labels) = create_data_with_labels("data/")

data\Picasso\150.jpg --> FAILED
data\Picasso\1503.jpg --> FAILED


  data = np.array(images)


### Statistieken

In [8]:
min_size = (0, 0)
max_size = (0, 0)

min_size_helper = 100_000
max_size_helper = 0

for image in data:
    size = image.shape[0] * image.shape[1] # get pixels of image

    if(size < min_size_helper):
        min_size_helper = size
        min_size = (image.shape[0], image.shape[1])

    if(size > max_size_helper):
        max_size_helper = size
        max_size = (image.shape[0], image.shape[1])


print(f"Min size: {min_size}")
print(f"Max size: {max_size}")

Min size: (136, 136)
Max size: (6000, 5918)
