# Creating Test and Training Data

The following code will create a .npz file
This File contains a specific number of training and test images. 
We will later pass this data in our neural network.

In [1]:
import numpy as np
import os
from scipy.misc import imread, imresize
import matplotlib.pyplot as plt
  
print ("Package loaded") 
cwd = os.getcwd()
print ("Current folder is %s" % (cwd) )

Package loaded
Current folder is C:\Users\nur20\Documents\GitHub\TensorFlow-Master


In [2]:
#Set Path for training and Testdata! 
paths = {"img_dataset/Bush", "img_dataset/Arnold"}

# Set the imagesize of the training and testting data
imgsize = [64, 64]

# Training and Testingdata will be grayscaled
use_gray = 1

# Set Name of the saved training/ testing data file
data_name = "custom_data"

print ("Your images should be at")
for i, path in enumerate(paths):
    # Your image should be at [1/i(Anzahl der Paths)] /cwd(home/data/Facerecognition)/path(z.B. img_dataset/Ben) 
    print (" [%d/%d] %s/%s" % (i+1, len(paths), cwd, path)) 

Your images should be at
 [1/2] C:\Users\nur20\Documents\GitHub\TensorFlow-Master/img_dataset/Arnold
 [2/2] C:\Users\nur20\Documents\GitHub\TensorFlow-Master/img_dataset/Bush


In [3]:
# Shows the path where the .npz file (Training and Testing data) will be stored
print ("Data will be saved to %s"% (cwd + '/data/' + data_name + '.npz'))

# Function, that turnes the image to a gray image
def rgb2gray(rgb):
    # If the length of the images shape is 3 
    if len(rgb.shape) is 3:
        return np.dot(rgb[...,:3], [0.299, 0.587, 0.114])
    else:
        # print ("Current Image if GRAY!")
        return rgb

# Nr of classes (image sets)
nclass     = len(paths)
valid_exts = [".jpg",".gif",".png",".tga", ".jpeg"]
imgcnt     = 0

# i counts through the number of classes (Not inside of the folders)
for i, relpath in zip(range(nclass), paths):
    path = cwd + "/" + relpath

    # A list of all file names inside of (path = imagefolder) will be stored in "flist"
    flist = os.listdir(path)
    
    # f counts through the list of images inside of class folder (f = George_W_Bush_0001.jpg, ...)
    for f in flist:

        # check if image has the right format
        if os.path.splitext(f)[1].lower() not in valid_exts:
            continue
            
        # add f as a variable to the path (f = image full name, George_W_Bush_0001...)
        fullpath = os.path.join(path, f)

        # Read image 
        currimg  = imread(fullpath)
        
        # Convert to grayscale  
        if use_gray:
            
            # Call rgb2gray function
            grayimg  = rgb2gray(currimg)
        else:
            
            # If its already gray, do nothing
            grayimg  = currimg
            
        # resize the image (image,new width, new heidth)/255
        graysmall = imresize(grayimg, [imgsize[0], imgsize[1]])/255. 

        # reshape image to 1 x 4096 vector
        grayvec   = np.reshape(graysmall, (1, -1))
        
        # Save -  curr_label is a vector [1. 0. 0.] then [0. 1. 0.] then [0. 0. 1.] and so on
        # np.eye returns a 2-D array with ones on the diagonal and zeros elsewhere
        # Here it is just a [1xn] array and we use it for labeling the images
        curr_label = np.eye(nclass, nclass)[i:i+1, :]

        if imgcnt is 0:
            totalimg   = grayvec
            totallabel = curr_label
        else:
            
            # concatenate -> Join a sequence of arrays along an existing axis.
            # totalimg contains now all all images as a long vector [923 x 4096]
            totalimg   = np.concatenate((totalimg, grayvec), axis=0)
            totallabel = np.concatenate((totallabel, curr_label), axis=0)
        
        # Next image
        imgcnt    = imgcnt + 1
print ("Total %d images loaded." % (imgcnt))

Data will be saved to C:\Users\nur20\Documents\GitHub\TensorFlow-Master/data/custom_data.npz
Total 224 images loaded.


In [4]:
def print_shape(string, x):
    print ("Shape of '%s' is %s" % (string, x.shape,))

# creates an array with values from zero to imgcnt and with the lengt: imgcnt
randidx    = np.random.randint(imgcnt, size=imgcnt)

# save 3/5 of the array randidx in trainidx
trainidx   = randidx[0:int(3*imgcnt/5)]

# save the rest (2/5) of the random array in testidx
testidx    = randidx[int(3*imgcnt/5):imgcnt]

# use the above defined arrays and devide the image/label list array in test and training set (3/5 - 2/5)
trainimg   = totalimg[trainidx, :]
trainlabel = totallabel[trainidx, :]
testimg    = totalimg[testidx, :]
testlabel  = totallabel[testidx, :]
print_shape("trainimg", trainimg)
print_shape("trainlabel", trainlabel)
print_shape("testimg", testimg)
print_shape("testlabel", testlabel)

# Save all data in an .npz (zip file)
# passed data
# - training images
# - training labels
# - test images
# - test labels
# - Nr. of images
# - gray images or not
savepath = cwd + "/data/" + data_name + ".npz"
np.savez(savepath, trainimg=trainimg, trainlabel=trainlabel
         , testimg=testimg, testlabel=testlabel, imgsize=imgsize, use_gray=use_gray)
print ("Saved to %s" % (savepath))

Shape of 'trainimg' is (134, 4096)
Shape of 'trainlabel' is (134, 2)
Shape of 'testimg' is (90, 4096)
Shape of 'testlabel' is (90, 2)
Saved to C:\Users\nur20\Documents\GitHub\TensorFlow-Master/data/custom_data.npz
