In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import random
import pickle

In [3]:
import random
from keras.preprocessing.image import ImageDataGenerator

# Root directory of the Stanford Dogs images; it holds one sub-directory per breed.
dir_path = '/Users/suare/Desktop/dogs/images1/Images'
# Keep only sub-directories (a stray file such as .DS_Store would otherwise be
# treated as a breed) and sort them so the class indexes are the same on every
# run and platform -- os.listdir() returns entries in arbitrary order.
categories = sorted(
    entry for entry in os.listdir(dir_path)
    if os.path.isdir(os.path.join(dir_path, entry))
)
# Maps each breed directory name to its integer class index.
class_to_index = {category: index for index, category in enumerate(categories)}
# Filled by create_training_data() with [image, class_index] pairs.
training_data = []
# Side length (in pixels) that every image is resized to.
img_size = 299


def _random_contrast(image, lower=0.8, upper=1.2):
    """Scale the contrast of one image by a random factor in [lower, upper].

    ImageDataGenerator has no ``contrast_range`` option, so contrast jitter is
    supplied through ``preprocessing_function`` instead. Keras calls that hook
    on each augmented image before ``rescale`` is applied, so pixel values are
    still expected to be on the 0-255 scale here.
    """
    factor = np.random.uniform(lower, upper)
    # Per-channel mean keeps the overall colour balance of the image.
    mean = image.mean(axis=(0, 1), keepdims=True)
    return np.clip((image - mean) * factor + mean, 0.0, 255.0)


# Augmentations
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=(0.8, 1.2),
    # Replaces the unsupported contrast_range=(0.8, 1.2) keyword, which makes
    # the ImageDataGenerator constructor raise TypeError.
    preprocessing_function=_random_contrast,
    rescale=1./255
)

# default is 1 augmentation
def create_training_data(num_augmentations=1):
    """Append augmented copies of every dataset image to ``training_data``.

    Walks each breed directory listed in ``categories`` under ``dir_path``,
    loads every file with OpenCV, converts it to RGB, resizes it to
    ``img_size`` x ``img_size`` and draws ``num_augmentations`` samples from
    ``datagen`` for it. Each sample is appended to the module-level
    ``training_data`` list as ``[image, class_index]``.

    Files that cannot be read or processed are reported on stdout and skipped.

    Args:
        num_augmentations: Number of augmented samples to generate per source
            image. Must be at least 1.

    Raises:
        ValueError: If ``num_augmentations`` is smaller than 1.
    """
    if num_augmentations < 1:
        raise ValueError(
            f"num_augmentations must be at least 1, got {num_augmentations}"
        )

    for category in categories:
        path = os.path.join(dir_path, category)
        class_num = class_to_index[category]

        print(f"Processing category: {category}, path: {path}")

        for img in os.listdir(path):
            try:
                bgr_image = cv2.imread(os.path.join(path, img))
                # cv2.imread signals an unreadable / non-image file by
                # returning None rather than raising.
                if bgr_image is None:
                    raise ValueError("file could not be read as an image")

                # OpenCV loads images as BGR; convert to RGB.
                img_array = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
                img_array = cv2.resize(img_array, (img_size, img_size))

                # Reshape to (1, img_size, img_size, 3) to fit datagen.flow
                img_array = img_array.reshape((1,) + img_array.shape)

                # datagen.flow() yields batches forever, so count the samples
                # produced for THIS image. Testing the global list length
                # instead gives the wrong count whenever the list does not
                # already hold a multiple of num_augmentations.
                generated = 0
                for batch in datagen.flow(img_array, batch_size=1):
                    augmented_image = batch[0]
                    training_data.append([augmented_image, class_num])
                    generated += 1
                    if generated >= num_augmentations:
                        break  # Exit the loop after generating the desired number of augmentations

            except Exception as e:
                # Best effort: report the bad file and carry on with the rest.
                print(f"Error processing image {img} in category {category}: {str(e)}")

# Seed the RNGs first so a fresh "Restart & Run All" reproduces the same data:
# `random` drives the shuffle below, and the augmentation draws are assumed to
# come from NumPy's global RNG since datagen.flow() is called without a seed.
random.seed(42)
np.random.seed(42)

# Create training data with data augmentation
# able to select number of augmentations
create_training_data(num_augmentations=2)

# Shuffle the training data in place so the classes are interleaved
random.shuffle(training_data)



Processing category: n02085620-Chihuahua, path: /Users/suare/Desktop/dogs/images1/Images\n02085620-Chihuahua
Processing category: n02085782-Japanese_spaniel, path: /Users/suare/Desktop/dogs/images1/Images\n02085782-Japanese_spaniel
Processing category: n02085936-Maltese_dog, path: /Users/suare/Desktop/dogs/images1/Images\n02085936-Maltese_dog
Processing category: n02086079-Pekinese, path: /Users/suare/Desktop/dogs/images1/Images\n02086079-Pekinese
Processing category: n02086240-Shih-Tzu, path: /Users/suare/Desktop/dogs/images1/Images\n02086240-Shih-Tzu
Processing category: n02086646-Blenheim_spaniel, path: /Users/suare/Desktop/dogs/images1/Images\n02086646-Blenheim_spaniel
Processing category: n02086910-papillon, path: /Users/suare/Desktop/dogs/images1/Images\n02086910-papillon
Processing category: n02087046-toy_terrier, path: /Users/suare/Desktop/dogs/images1/Images\n02087046-toy_terrier
Processing category: n02087394-Rhodesian_ridgeback, path: /Users/suare/Desktop/dogs/images1/Image

In [4]:
# Report how many [image, class_index] samples training_data currently holds.
print(len(training_data))

41160


In [3]:
# Collect the name of every directory found anywhere beneath the dataset root.
# NOTE(review): this cell rebinds the notebook-level `dir_path` and
# `categories`, and the walk also picks up the intermediate folders, so the
# list is not limited to breed directories -- confirm this is only exploratory.
dir_path = "/Users/suare/Desktop/dogs"
categories = []

for _root, subdirs, _files in os.walk(dir_path):
    # Names are added in the order os.walk reports them for each level.
    categories.extend(subdirs)
categories

['images1',
 'Images',
 'n02085620-Chihuahua',
 'n02085782-Japanese_spaniel',
 'n02085936-Maltese_dog',
 'n02086079-Pekinese',
 'n02086240-Shih-Tzu',
 'n02086646-Blenheim_spaniel',
 'n02086910-papillon',
 'n02087046-toy_terrier',
 'n02087394-Rhodesian_ridgeback',
 'n02088094-Afghan_hound',
 'n02088238-basset',
 'n02088364-beagle',
 'n02088466-bloodhound',
 'n02088632-bluetick',
 'n02089078-black-and-tan_coonhound',
 'n02089867-Walker_hound',
 'n02089973-English_foxhound',
 'n02090379-redbone',
 'n02090622-borzoi',
 'n02090721-Irish_wolfhound',
 'n02091032-Italian_greyhound',
 'n02091134-whippet',
 'n02091244-Ibizan_hound',
 'n02091467-Norwegian_elkhound',
 'n02091635-otterhound',
 'n02091831-Saluki',
 'n02092002-Scottish_deerhound',
 'n02092339-Weimaraner',
 'n02093256-Staffordshire_bullterrier',
 'n02093428-American_Staffordshire_terrier',
 'n02093647-Bedlington_terrier',
 'n02093754-Border_terrier',
 'n02093859-Kerry_blue_terrier',
 'n02093991-Irish_terrier',
 'n02094114-Norfolk_terr

In [8]:
# Data is too big to pickle in one piece, so it is written out in batches.
# Each batch file holds up to 2000 samples.
batch_size = 2000
# Ceiling division: `len // batch_size + 1` would write an extra, empty batch
# whenever the sample count is an exact multiple of batch_size (or zero).
num_batches = (len(training_data) + batch_size - 1) // batch_size
# Go through training_data and write one X / y file pair per batch index.
for batch_index in range(num_batches):
    start = batch_index * batch_size
    end = start + batch_size
    batch_data = training_data[start:end]

    # Split the [image, class_index] pairs into an image array and a label array.
    X_batch = np.array([item[0] for item in batch_data]).reshape(-1, img_size, img_size, 3)
    y_batch = np.array([item[1] for item in batch_data])

    # Save the batch data; `with` closes each file even if pickling fails.
    with open(f"X_batch_{batch_index}.pickle", "wb") as pickle_out:
        pickle.dump(X_batch, pickle_out)

    with open(f"y_batch_{batch_index}.pickle", "wb") as pickle_out:
        pickle.dump(y_batch, pickle_out)