# Data preprocessing
## Action Plan
1. Create Validation and Sample sets
2. Rearrange image files into their respective directories
3. Pad every image to a square
4. Compute the mean and standard deviation of the pixel data

## Imports

In [1]:
# Show the working directory; the next cell derives its path constants from it.
%pwd

'/home/nathan/git/planktonDataScienceBowl/scripts'

In [2]:
#Create references to important directories we will use over and over
import os, sys
# Both path constants are anchored to the directory the kernel is running in.
SCRIPTS_HOME_DIR = current_dir = os.getcwd()
DATA_HOME_DIR = '{}/data'.format(SCRIPTS_HOME_DIR)

Using tensorflow with theano image ordering to match utils.py

In [3]:
from glob import glob
import numpy as np
import _pickle as pickle
import PIL
from PIL import Image
from PIL import ImageOps
from tqdm import tqdm
import bcolz

In [3]:
#import modules
from utils import *
from vgg16 import Vgg16

%matplotlib inline

Using gpu device 0: GeForce GTX 1070 (CNMeM is enabled with initial size: 85.0% of memory, cuDNN 5105)
Using Theano backend.


## Create validation set and sample

In [5]:
# List the contents of the current directory.
%ls

[0m[01;34mdata[0m/              plankton_model.ipynb        [01;34m__pycache__[0m/  [01;32mvgg16bn.py[0m*
moving_data.ipynb  plankton_model_vgg16.ipynb  [01;32mutils.py[0m*     [01;32mvgg16.py[0m*


In [6]:
#Create directories
%cd $DATA_HOME_DIR
%mkdir valid
%mkdir results
%mkdir -p sample/train
%mkdir -p sample/test
%mkdir -p sample/valid
%mkdir -p sample/results
%mkdir -p test/unknown

/home/nathan/git/planktonDataScienceBowl/scripts/data


### Train/Valid

In [7]:
# Work from train/ so that the relative os.listdir(".") in the next cell lists its sub-folders.
%cd $DATA_HOME_DIR/train

/home/nathan/git/planktonDataScienceBowl/scripts/data/train


In [8]:
# Names of the sub-directories of the current directory, in os.listdir order.
folders = list(filter(os.path.isdir, os.listdir(".")))

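Create the validation set by moving roughly 10% of each class folder's images from `train/` to the matching folder under `valid/`.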

In [9]:
# Move roughly 10% of each class folder's images (round(n/10 + 1) files) from
# train/<folder> to valid/<folder>.
#
# NOTE: this cell is not idempotent -- every run moves a further slice out of
# train/, so run it exactly once on a freshly unpacked data set.

# A private, seeded generator plus a sorted file list make the split
# reproducible regardless of the order the filesystem returns names in.
split_rng = np.random.RandomState(42)

for folder in folders:
    os.chdir(DATA_HOME_DIR + '/train/' + folder)
    valid_dir = DATA_HOME_DIR + '/valid/' + folder

    # exist_ok tolerates a re-run but still surfaces real failures
    # (permissions, a file in the way, ...) that a bare except would hide.
    os.makedirs(valid_dir, exist_ok=True)

    g = sorted(glob('*.jpg'))
    if not g:
        # Nothing to move; indexing an empty permutation would raise IndexError.
        continue

    shuf = split_rng.permutation(g)
    n_valid = round(len(shuf) / 10 + 1)
    # Slicing (rather than indexing up to n_valid) can never run past the end.
    for image_name in shuf[:n_valid]:
        os.rename(image_name, valid_dir + '/' + image_name)

In [10]:
# need to copy files - use copyfile
from shutil import copyfile

### Sample
Make sample training set

In [11]:
# g = glob('*.jpg')
# shuf = np.random.permutation(g)
# for i in range(200): copyfile(shuf[i], DATA_HOME_DIR+'/sample/train/' + shuf[i])

Next make sample validation set

In [12]:
# %cd $DATA_HOME_DIR/valid

In [13]:
# g = glob('*.jpg')
# shuf = np.random.permutation(g)
# for i in range(50): copyfile(shuf[i], DATA_HOME_DIR+'/sample/valid/' + shuf[i])

## Rearrange image files into their respective directories

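Images need to be grouped into one folder per class. The commented-out cells below are left over from a cats-vs-dogs template; the cells above already treat each sub-folder of `train/` as one class, so only the test images still need to be moved.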

### Train/Valid

In [14]:
# %cd $DATA_HOME_DIR/valid
# %mkdir cats
# %mkdir dogs
# %mv cat.*.jpg cats/
# %mv dog.*.jpg dogs/

# %cd $DATA_HOME_DIR/train
# %mkdir cats
# %mkdir dogs
# %mv cat.*.jpg cats/
# %mv dog.*.jpg dogs/

### Sample

In [15]:
# %cd $DATA_HOME_DIR/sample/train
# %mkdir cats
# %mkdir dogs
# %mv cat.*.jpg cats/
# %mv dog.*.jpg dogs/

# %cd $DATA_HOME_DIR/sample/valid
# %mkdir cats
# %mkdir dogs
# %mv cat.*.jpg cats/
# %mv dog.*.jpg dogs/

### Test

In [16]:
# Create single 'unknown' class for test set
# (presumably so that test/ has the same one-folder-per-class layout as train/ -- TODO confirm)
%cd $DATA_HOME_DIR/test
# Move every jpg sitting directly in test/ into test/unknown/ (created by the directory-setup cell).
%mv *.jpg unknown/

/home/nathan/git/planktonDataScienceBowl/scripts/data/test


## Info on image size

In [17]:
def resize_save_image(image_path):
    """Pad the image at ``image_path`` to a square and overwrite the file.

    The image is read as 8-bit greyscale. Its shorter dimension is padded with
    white (255) pixels, split as evenly as possible between the two sides,
    until it matches the longer dimension. The result is saved back to the
    same path.

    Parameters
    ----------
    image_path : str
        Path of the image file to rewrite in place.

    Returns
    -------
    None
    """
    # Close the file handle promptly. convert('L') guarantees a 2-D uint8
    # array even when the file is stored in another mode (RGB, palette, ...),
    # which the two-axis pad specification below requires.
    with Image.open(image_path) as source_image:
        image_array = np.asarray(source_image.convert('L'))

    height, width = image_array.shape
    square = max(height, width)

    # Split each deficit into a before/after pair. The "after" side takes the
    # extra pixel when the deficit is odd, so the padded array is exactly
    # square and the image never has to be resampled to make up one pixel.
    pad_top = (square - height) // 2
    pad_bottom = square - height - pad_top
    pad_left = (square - width) // 2
    pad_right = square - width - pad_left

    padded_array = np.pad(image_array,
                          ((pad_top, pad_bottom), (pad_left, pad_right)),
                          mode='constant', constant_values=255)

    # A 2-D uint8 array becomes a mode 'L' (greyscale) image.
    img = Image.fromarray(padded_array)

    # invert colors
#     img = ImageOps.invert(img)

    # save image
    img.save(image_path)

### Train

In [18]:
%cd $DATA_HOME_DIR/train
folders = ([name for name in os.listdir(".") if os.path.isdir(name)])

/home/nathan/git/planktonDataScienceBowl/scripts/data/train


In [19]:
# Run resize_save_image over every jpg in each sub-folder of train/ (files are rewritten in place).
for class_dir in folders:
    os.chdir(DATA_HOME_DIR + '/train/' + class_dir)
    for jpg_name in glob('*.jpg'):
        resize_save_image(jpg_name)

### Valid

In [4]:
%cd $DATA_HOME_DIR/valid
folders = ([name for name in os.listdir(".") if os.path.isdir(name)])

/home/nathan/git/planktonDataScienceBowl/scripts/data/valid


In [21]:
# Run resize_save_image over every jpg in each sub-folder of valid/ (files are rewritten in place).
for class_dir in folders:
    os.chdir(DATA_HOME_DIR + '/valid/' + class_dir)
    for jpg_name in glob('*.jpg'):
        resize_save_image(jpg_name)

### Test

In [22]:
%cd $DATA_HOME_DIR/test
folders = ([name for name in os.listdir(".") if os.path.isdir(name)])

/home/nathan/git/planktonDataScienceBowl/scripts/data/test


In [23]:
# Run resize_save_image over every jpg in each sub-folder of test/ (files are rewritten in place).
for class_dir in folders:
    os.chdir(DATA_HOME_DIR + '/test/' + class_dir)
    for jpg_name in glob('*.jpg'):
        resize_save_image(jpg_name)

## Mean of data

In [5]:
%cd $DATA_HOME_DIR/train
folders = ([name for name in os.listdir(".") if os.path.isdir(name)])

/home/nathan/git/planktonDataScienceBowl/scripts/data/train


In [6]:
# Load every training jpg, resized to 128x128, and stack the arrays along
# axis 0 into X_train. For single-channel images that gives a
# (n_images * 128, 128) array, which the reshapes further down rely on.
train_arrays = []
for folder in tqdm(folders):
    os.chdir(DATA_HOME_DIR + '/train/' + folder)
    for image_path in glob('*.jpg'):
        # The context manager closes the file once the pixels are copied out.
        with Image.open(image_path) as source_image:
            train_arrays.append(np.asarray(source_image.resize((128, 128))))

# Concatenate once at the end: growing X_train image by image re-copies the
# whole array on every iteration (quadratic time). A shape mismatch now
# raises here instead of being swallowed and silently resetting X_train.
X_train = np.concatenate(train_arrays, axis=0) if train_arrays else None

  0%|          | 0/121 [00:00<?, ?it/s]

failed to concatenate


100%|██████████| 121/121 [16:58<00:00,  9.57s/it]


In [9]:
%cd $DATA_HOME_DIR/valid
folders = ([name for name in os.listdir(".") if os.path.isdir(name)])

/home/nathan/git/planktonDataScienceBowl/scripts/data/valid


In [21]:
# Keep only the first N_TRAIN_IMAGES images of X_train.
# NOTE(review): presumably this undoes an earlier run of the validation-append
# cell below, which grows X_train every time it runs -- confirm.
# The image count is derived with -1 rather than hard-coded, so the cell is a
# no-op on a freshly built X_train instead of raising ValueError.
N_TRAIN_IMAGES = 27184
X_train = X_train.reshape(-1, 128, 128)[:N_TRAIN_IMAGES].reshape(-1, 128)

In [22]:
# Append every validation jpg, resized to 128x128, to the rows of X_train.
# NOTE: not idempotent -- each run appends the validation images again.
valid_arrays = []
for folder in tqdm(folders):
    os.chdir(DATA_HOME_DIR + '/valid/' + folder)
    for image_path in glob('*.jpg'):
        # The context manager closes the file once the pixels are copied out.
        with Image.open(image_path) as source_image:
            valid_arrays.append(np.asarray(source_image.resize((128, 128))))

# A single concatenate avoids re-copying the whole of X_train once per image.
X_train = np.concatenate([X_train] + valid_arrays, axis=0)


  0%|          | 0/121 [00:00<?, ?it/s][A
  1%|          | 1/121 [00:01<02:27,  1.23s/it][A
100%|██████████| 121/121 [03:49<00:00,  1.17s/it]


In [23]:
# Number of 128-row images stacked in X_train.
imgs = len(X_train) // 128

In [24]:
# View the stacked rows as (image, channel, row, column) with one channel.
X_train2 = np.reshape(X_train, (imgs, 1, 128, 128))

In [25]:
# Mean over every pixel of X_train2, shown as float32.
np.mean(X_train2).astype(np.float32)

11.868742

In [26]:
# Standard deviation over every pixel of X_train2, shown as float32.
np.std(X_train2).astype(np.float32)

38.339569

In [31]:
# Divide every pixel value by 255.
X_train3 = np.true_divide(X_train2, 255)

In [33]:
# Mean over every pixel of X_train3, shown as float32.
np.mean(X_train3).astype(np.float32)

0.046544086

In [34]:
# Standard deviation over every pixel of X_train3, shown as float32.
np.std(X_train3).astype(np.float32)

0.15035126