In [1]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.image as img
from PIL import Image
import collections
from keras.utils import to_categorical
from random import randint
%matplotlib inline
import matplotlib.pyplot as plt
import pickle
from sklearn.utils import shuffle

Using TensorFlow backend.


In [4]:
# paths for training and test data
trainPath = "input/seg_train/"
testPath = "input/seg_test/"

In [5]:
# define a function to load images
def load_image(infilename):
    image = Image.open(infilename)
    image.load()
    return image

In [6]:
# define function to read in images and labels
def read_images(folder):
    
    subFolders = os.listdir(folder)
    
    images = []
    labels = []
    
    # read in images, convert to arrays, resize if necessary and append labels
    for subFolder in subFolders:
        
        subFolderPath = folder + subFolder + "/"
        print("Reading in images from " + subFolderPath)
        
        for file in os.listdir(subFolderPath):
            if file.endswith("jpg"):
                # read in image
                image = load_image(subFolderPath + file)

                # append image
                images.append(image)

                # append labels
                labels.append(subFolder)
    
    # Checks
    print("Number of images = " + str(len(images)))
    print("Number of labels = " + str(len(labels)))
    print("Label frequencies: " + str(collections.Counter(labels)))
    
    # return as arrays
    return images, labels

In [7]:
# Read in training data
images_train, labels_train = read_images(trainPath)

Reading in images from input/seg_train/buildings/
Reading in images from input/seg_train/forest/
Reading in images from input/seg_train/glacier/
Reading in images from input/seg_train/mountain/
Reading in images from input/seg_train/sea/
Reading in images from input/seg_train/street/
Number of images = 14034
Number of labels = 14034
Label frequencies: Counter({'mountain': 2512, 'glacier': 2404, 'street': 2382, 'sea': 2274, 'forest': 2271, 'buildings': 2191})


In [8]:
# Read in test data
images_test, labels_test = read_images(testPath)

Reading in images from input/seg_test/buildings/
Reading in images from input/seg_test/forest/
Reading in images from input/seg_test/glacier/
Reading in images from input/seg_test/mountain/
Reading in images from input/seg_test/sea/
Reading in images from input/seg_test/street/
Number of images = 3000
Number of labels = 3000
Label frequencies: Counter({'glacier': 553, 'mountain': 525, 'sea': 510, 'street': 501, 'forest': 474, 'buildings': 437})


In [10]:
# Shuffle training datasets
images_train, labels_train = shuffle(images_train, labels_train)

In [11]:
# Take 3000 images from training for validation
images_valid = images_train[-3000:]
labels_valid = labels_train[-3000:]
print("Number of images = " + str(len(images_valid)))
print("Number of labels = " + str(len(labels_valid)))
print("Label frequencies: " + str(collections.Counter(labels_valid)))

Number of images = 3000
Number of labels = 3000
Label frequencies: Counter({'mountain': 538, 'glacier': 537, 'sea': 498, 'forest': 478, 'street': 475, 'buildings': 474})


In [12]:
# Remove validation images from training dataset
images_train = images_train[:-3000]
labels_train = labels_train[:-3000]
print("Number of images = " + str(len(images_train)))
print("Number of labels = " + str(len(labels_train)))
print("Label frequencies: " + str(collections.Counter(labels_train)))

Number of images = 11034
Number of labels = 11034
Label frequencies: Counter({'mountain': 1974, 'street': 1907, 'glacier': 1867, 'forest': 1793, 'sea': 1776, 'buildings': 1717})


In [13]:
# define function to convert images to arrays, resizing if necessary
def convertImages(images):
    
    X_data = []
    
    for image in images:
        
        # convert to array
        x = np.asarray(image, dtype="int32")
            
        # resize if necessary
        if x.shape != (150, 150, 3):
            resized = image.resize((150, 150), Image.LANCZOS)
            x = np.asarray(resized, dtype="int32")
            
        # append data
        X_data.append(x)
    
    return np.array(X_data)

In [14]:
X_train = convertImages(images_train)
X_test = convertImages(images_test)
X_valid = convertImages(images_valid)

In [15]:
# Create dictionary which maps class to an integer label
classDict = dict((label, counter) for counter, label in enumerate(list(set(labels_train))))
print(classDict)

{'glacier': 0, 'mountain': 1, 'sea': 2, 'forest': 3, 'street': 4, 'buildings': 5}


In [16]:
# Get number of classes
classes = len(classDict)
classes

6

In [17]:
# Define function to one-hot encode labels
def oneHot(labels, classDict, classes):
    
    # convert labels to integer values
    values = []
    for label in labels:
        values.append(classDict[label])
    
    # one-hot encode label values    
    onehot = to_categorical(values, classes)
    
    return np.array(onehot)

In [18]:
y_train = oneHot(labels_train, classDict, classes)
y_test = oneHot(labels_test, classDict, classes)
y_valid = oneHot(labels_valid, classDict, classes)

In [19]:
# Define pickle function so that data can be saved and imported into next stage
def saveData(data, filename):
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

In [20]:
# saveData
saveData(X_train[:5000], 'X_train1.pickle')
saveData(X_train[5000:], 'X_train2.pickle')
saveData(y_train, 'y_train.pickle')
saveData(images_train, 'images_train.pickle')
saveData(labels_train, 'labels_train.pickle')
saveData(X_test, 'X_test.pickle')
saveData(y_test, 'y_test.pickle')
saveData(images_test, 'images_test.pickle')
saveData(labels_test, 'labels_test.pickle')
saveData(X_valid, 'X_valid.pickle')
saveData(y_valid, 'y_valid.pickle')
saveData(images_valid, 'images_valid.pickle')
saveData(labels_valid, 'labels_valid.pickle')

In [21]:
!dir *.pickle

 Volume in drive C is Local Disk
 Volume Serial Number is 0671-FAEA

 Directory of C:\Users\mattc\Coursera\ibm-ads-capstone

17/03/2019  20:07       202,558,540 images_test.pickle
17/03/2019  20:06       744,997,171 images_train.pickle
17/03/2019  20:07       202,605,340 images_valid.pickle
17/03/2019  20:07             6,065 labels_test.pickle
17/03/2019  20:06            22,151 labels_train.pickle
17/03/2019  20:07             6,065 labels_valid.pickle
17/03/2019  20:06       810,000,160 X_test.pickle
17/03/2019  20:06     1,350,000,160 X_train1.pickle
17/03/2019  20:06     1,629,180,160 X_train2.pickle
17/03/2019  20:07       810,000,160 X_valid.pickle
17/03/2019  20:06            72,155 y_test.pickle
17/03/2019  20:06           264,971 y_train.pickle
17/03/2019  20:07            72,155 y_valid.pickle
              13 File(s)  5,749,785,253 bytes
               0 Dir(s)  21,017,133,056 bytes free
