# 2 Data processing
In this section of the notebook we take the original training dataset provided by Happy Whale and transform and reshape it to fit our needs.
Steps that will be performed in this script:
- Definition of useful functions
-- Data organization
-- Image augmentation
-- Image RGB to grey scale transformation
-- Image scaling
-- Pickle data

We will start with the usual include section:

In [None]:
#import math
#from collections import Counter

#import matplotlib.pyplot as plt
#import pandas as pd
#import numpy as np
#from PIL import Image
#import os
#import sys
#import shutil
#import random
#import imageio

#from six.moves import cPickle as pickle

#from tqdm import tqdm

%matplotlib inline

# Definition of useful functions

In [None]:
def copy_files_to_dir(nfiles, source_dir, destination_dir, maximum=0):
    """
    Description: copy a given set of files from a source directory to a destination
    directory, optionally limiting how many files are copied. Names that do not
    refer to an existing regular file inside source_dir are skipped and do NOT
    count towards the limit.
    -> nfiles: list of file names, relative to source_dir
    -> source_dir: path of the directory where the files will be copied from
    -> destination_dir: path of the directory where the files will be copied to
    -> maximum: maximum number of files to be copied; 0 (or any non-positive
       value) means no limit
    """
    copied = 0
    for nfile in nfiles:
        # Stop as soon as the quota of successfully copied files is reached.
        if maximum > 0 and copied >= maximum:
            return
        full_file_path = os.path.join(source_dir, nfile)
        if os.path.isfile(full_file_path):
            shutil.copy(full_file_path, destination_dir)
            # Only real copies consume the quota; skipped names do not.
            copied += 1
            
def copy_images_to_dir_expand(images, source_dir, destination_dir, maximum=0, minimum=0):
    """
    Description: copy a given set of images from a source directory to a destination
    directory, optionally you can also set a maximum number of images to be copied and
    a minimum number of images to be present. In case the provided list of images is
    shorter than the specified minimum, the shortfall (minimum - len(images)) is
    filled with augmented copies of randomly chosen source images.
    -> images: list of image file names, relative to source_dir
    -> source_dir: path of the directory where the files will be copied from
    -> destination_dir: path of the directory where the files will be copied to
    -> maximum: maximum number of files to be copied (0 means no limit)
    -> minimum: minimum number of files wanted in destination_dir (0 disables
       augmentation)
    Relies on load_img, img_to_array and a configured datagen object being
    available at module level (they are not defined in this function).
    """
    copy_files_to_dir(images, source_dir, destination_dir, maximum)

    shortfall = minimum - len(images)
    # Nothing to do when augmentation is disabled, when there are already enough
    # images, or when there is no source image to augment from.
    if minimum == 0 or shortfall <= 0 or len(images) == 0:
        return

    # Generate exactly `shortfall` augmented images (one per iteration).
    for _ in range(shortfall):
        image_ex = load_img(os.path.join(source_dir, random.choice(images)))
        x = img_to_array(image_ex)
        # Add a leading batch axis: the generator is fed a batch of one image.
        x = x.reshape((1,) + x.shape)
        # NOTE(review): every generated file shares the same prefix; uniqueness
        # presumably relies on the suffix the generator appends - confirm that
        # a name clash cannot overwrite an earlier augmented image.
        for batch in datagen.flow(x, batch_size=1,
                                  save_to_dir=destination_dir, save_prefix='exp_', save_format='jpeg'):
            break  # otherwise the generator would loop indefinitely
                    
def generate_file_structure(root_path, source_dir=None, maximum=0, minimum=0):
    """
    Description: given a root path it generates a folder tree with one sub-folder
    per distinct value of the 'Id' column of the module-level train_df, and
    optionally fills each sub-folder with the images listed for that id.
    Progress is printed as a percentage of processed ids.
    -> root_path: path of the directory under which the id folders are created
       (it is created if it does not exist)
    -> source_dir: path of the directory where the files will be copied from
       include this path if you want to fill the generated tree with images (optional)
    -> maximum: maximum number of files to be copied per id (optional)
    -> minimum: minimum number of files wanted per id (optional)
    """
    progress = 0
    print('Progress '+str(progress)+' %')
    if not os.path.exists(root_path):
        os.makedirs(root_path)

    # One folder per distinct whale id.
    ids = train_df['Id'].values.tolist()
    folder_ids = set(ids)

    # Group the image names by id in a single pass instead of re-filtering the
    # whole dataframe once per id. Row order within each id is preserved.
    images_by_id = {}
    if source_dir is not None:
        for folder_id, image in zip(ids, train_df['Image'].values.tolist()):
            images_by_id.setdefault(folder_id, []).append(image)

    total = len(folder_ids)
    for index, folder_id in enumerate(folder_ids):
        new_path_folder = os.path.join(root_path, folder_id)
        if not os.path.exists(new_path_folder):
            os.makedirs(new_path_folder)
        if source_dir is not None:
            copy_images_to_dir_expand(images_by_id[folder_id], source_dir,
                                      new_path_folder, maximum, minimum)
        # index + 1 ids are done at this point, so the last id reports 100 %.
        current = round(100*float(index + 1)/float(total))
        if current > progress:
            print('Progress '+str(current)+' %')
        progress = current