In [None]:
"""
Gather the local image data and shuffle it for training.

You may need to change the Kaggle folder name in the DATADIR path to match your setup.
"""

import numpy as np # Array of images
import matplotlib.pyplot as plt
import os # Iterate through directories and join paths
import cv2 # Image operations


# Class names to classify; the loops below join each one onto DATADIR,
# so each is also the name of a sub-folder
CATEGORIES = ["Dog", "Cat"]

# Directory the notebook is running from, printed so the path can be checked
DIR = os.getcwd()
print(DIR)

# Build the path to the PetImages folder (this only joins path components;
# it does not change the working directory)
DATADIR = os.path.join(DIR, "kagglecatsanddogs_3367a", "PetImages")


"1. Convert all the image examples to an array then convert to grey scale if essential"

# Preview one example image to confirm the files load the way we expect.
#
# cv2.imread() does not raise on a file it cannot decode -- it returns None --
# so keep scanning until an entry actually loads instead of trusting the first
# directory entry. Entries are sorted because os.listdir() returns them in
# arbitrary order, which would make the previewed image vary between runs.
#
# IMREAD_GRAYSCALE loads a single channel (a third of the data of a
# 3-channel colour image); colour is not essential for this task.
img_array = None
for category in CATEGORIES:
    path = os.path.join(DATADIR, category)  # path to the dogs or cats dir
    for img in sorted(os.listdir(path)):
        # os.path.join(path, img) gives the full path to that image
        img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
        if img_array is not None:
            break
    if img_array is not None:
        break

# Fail with a clear message rather than an unrelated error inside imshow()
if img_array is None:
    raise RuntimeError("No readable image found under " + DATADIR)

# Show the picture to make sure it is what we expect
plt.imshow(img_array, cmap="gray")
plt.show()

In [None]:
"2. Make everything the same shape 50x50 , decide the size u wanna go with"
# Side length, in pixels, of the square that every image is resized to
IMG_SIZE = 50

# Resize the image loaded in the previous cell and display it, to judge
# whether the subject is still recognisable at this resolution
new_array = cv2.resize(img_array, dsize=(IMG_SIZE, IMG_SIZE))
plt.imshow(new_array, cmap="gray")
plt.show()

In [None]:
"3. Create training_dataset which has categories in number and broken images are got rid of"
training_data = []


def create_training_data():
    """Append one ``[image, label]`` pair per readable image to ``training_data``.

    Walks every category folder under ``DATADIR``, loads each file as a
    grayscale array, resizes it to ``IMG_SIZE`` x ``IMG_SIZE`` and appends
    ``[resized_array, class_num]`` to the module-level ``training_data`` list.
    ``class_num`` is the category's position in ``CATEGORIES``, because the
    labels have to be numeric rather than the strings "Dog" / "Cat".

    Files that OpenCV cannot decode or resize are skipped. Only ``cv2.error``
    is caught, so mistakes in the notebook itself (for example ``IMG_SIZE``
    not being defined yet) raise instead of silently producing an empty
    dataset.

    Returns:
        int: Number of files that were skipped.
    """
    skipped = 0
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(DATADIR, category)  # path to the dogs or cats dir

        for img in os.listdir(path):
            try:
                img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
                if img_array is None:
                    # imread() reports an unreadable file by returning None
                    skipped += 1
                    continue
                new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))
            except cv2.error:
                # Broken image that OpenCV refused to process
                skipped += 1
                continue

            training_data.append([new_array, class_num])
    return skipped


# Training dataset should be balanced 50/50 for dog/cat dataset so model will not have hardtime learning
skipped_files = create_training_data()
print("Skipped unreadable files:", skipped_files)


In [None]:
# Number of [image, label] pairs currently held in training_data
num_examples = len(training_data)
print(num_examples)

In [None]:
"4. Shuffle the data so the neuron network can learn better"
import random

# Fixed seed so that, for the same input order, the shuffled order is the
# same on every run. A private Random instance is used so the global random
# state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(training_data)   # training_data is a list, shuffled in place


In [None]:
"5. Pack to variables we use to feed neural network"
# Split the [image, label] pairs into a list of images and a list of labels
X = [features for features, _ in training_data]
y = [label for _, label in training_data]

# y stays a plain list, but X is turned into a NumPy array and reshaped to
# (-1, IMG_SIZE, IMG_SIZE, 1): -1 lets NumPy infer the number of images and
# the trailing 1 is the single grayscale channel (it would be 3 for RGB)
X = np.array(X).reshape(-1, IMG_SIZE, IMG_SIZE, 1)

In [None]:
"6. Save data using pickle"
# Save the prepared data so it does not have to be rebuilt on every run
import pickle #or numpy

# Write each object to its own pickle file. The with-blocks close the files
# even if pickle.dump() raises.
with open("X.pickle", "wb") as pickle_out:
    pickle.dump(X, pickle_out)

with open("y.pickle", "wb") as pickle_out:
    pickle.dump(y, pickle_out)