# Opdracht schilderijen

In [27]:
import tensorflow as tf
import os
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import cv2

## Data cleanup

Verwijderen van duplicaten en nummeren per schilder

In [28]:
  
# verwijderen van dubbels
def remove_duplicates(dir_path):

    """
    Verwijderen van duplicaten. Wanneer er een duplicaat aanwezig is komt er een (2) op het einde van de naam.
    Sommige files heten bv. untitles-1970 (2).jpg!Portrait.jpg, daardoor moeten we eerst bij alle files het stuk na het uitroepteken weglaten,
    dit gebeurd in de clean_paths functie.
    Wanneer er een duplicaat gevonden wordt, wordt deze verwijderd.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    """

    clean_paths(dir_path)

    for path in os.listdir(dir_path):
        # check if current path is a file
        if (os.path.isfile(os.path.join(dir_path, path)) and ("(2)." in path)):
            os.remove(f"{dir_path}/{path}")



def rename_files(dir_path):
    """"
    Hernoemen van de files naar de nummering per schilder

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    """
    i = 0

    for path in os.listdir(dir_path):
        # check if current path is a file
        if os.path.isfile(os.path.join(dir_path, path)):
            os.rename(f"{dir_path}/{path}", f"{dir_path}/{i}.jpg")
            i += 1
        
           
# opkuisen path (bv. 1474.jpg!Portrait.jpg --> 1474.jpg)
def clean_paths(dir_path):
    """"
    Sommige files heten bv. untitles-1970 (2).jpg!Portrait.jpg, daardoor moeten we eerst bij alle files het stuk na het uitroepteken weglaten.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    """

    for path in os.listdir(dir_path):
        # check if current path is a file
        if (os.path.isfile(os.path.join(dir_path, path)) and ("!" in path)):
            
            new_name = path.split("!")[0]
                        
            if not os.path.isfile(os.path.join(dir_path, new_name)): # if file doesn't already exist
                os.rename(f"{dir_path}/{path}", f"{dir_path}/{new_name}")
            else:
                os.remove(f"{dir_path}/{path}") # if file with new name already exist, remove file with bad name
     


        
# data cleanup functies toepassen om iedere directory
for painter in ["Mondriaan", "Picasso", "Rubens"]:
    remove_duplicates(f"data/{painter}")
    #clean_paths(f"data/{painter}")
    rename_files(f"data/{painter}")

## Inlezen data

Data wordt ingelezen (en nog niet gepreprocessed). 

In [29]:
def create_data_with_labels(dataset_dir):
    
    image_paths_per_label = collect_paths_to_files(dataset_dir)

    images = []
    labels = []
    for label, image_paths in image_paths_per_label.items():
        for image_path in image_paths:

            # print(str(image_path))

            img = cv2.imread(str(image_path))

            if(img is not None):
                # print(f"{i} {str(image_path)} --> succes")
                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                images.append(img)
                
                # print(label)
                labels.append(label)       
                      
            else:
                print(f"{str(image_path)} --> FAILED")
                
    # data = np.array([preprocess_image(image.astype(np.float32))
    #             for image in images])
    data = np.array(images)
    labels = np.array(labels)
        
    return data, labels

def collect_paths_to_files(dataset_dir):
    """Returns a dict with labels for each subdirectory of the given directory
    as keys and lists of the subdirectory's contents as values.

    Parameters:
        dataset_dir: A string containing the path to a directory containing
            subdirectories to different classes.
    Returns:
        image_paths_per_label: A dict with labels as keys and lists of file
        paths as values.
    """
    dataset_dir = Path(dataset_dir)
    painter_dirs = [f for f in sorted(os.listdir(dataset_dir)) if not f.startswith('.')]
    image_paths_per_label = {
        label: [
            dataset_dir / painter_dir / '{0}'.format(f)
            for f in os.listdir(dataset_dir / painter_dir) if not f.startswith('.')
        ]
        for label, painter_dir in enumerate(painter_dirs)
    }
    return image_paths_per_label

# def preprocess_image(image):
#     """Returns a preprocessed image.

#     Parameters:
#         image: A RGB image with pixel values in range [0, 255].
#     Returns
#         image: The preprocessed image.
#     """
#     image = image / 255.
#     image = cv2.resize(image, (100, 100))

#     return image


In [30]:
(data, labels) = create_data_with_labels("data/")

data\Picasso\145.jpg --> FAILED
data\Picasso\1480.jpg --> FAILED


  data = np.array(images)


## Data exploration

In [31]:
def check_amount_of_images(dir_path):
    count = 0
    for path in os.listdir(dir_path):
        # check if current path is a file
        if os.path.isfile(os.path.join(dir_path, path)):
            count += 1
    print(f'File count {dir_path}: {count}')

check_amount_of_images("data/Mondriaan")
check_amount_of_images("data/Picasso")
check_amount_of_images("data/Rubens")

File count data/Mondriaan: 321
File count data/Picasso: 1506
File count data/Rubens: 617
