# data_generator
This repository contains the code for generating the dataset in npz format for the Capstone project.

### Instructions
1. Create one folder per class in the same directory and name each folder after its label. (annotation_parser can do this for you.)
2. Put the images into the folder that matches their label.
3. Set `scale_percent`: each image is resized to `scale_percent`% of its original size (unit: %, default = 100).
4. List all labels (i.e. all folder names) in the array `class_name`.
5. Set `sliding_window_size` to the width of the input images in pixels. Note that the images must be square (height equal to width).
6. Adjust the dataset path, the image file format, and the export file name.

### Reference
1. numpy save data: https://numpy.org/doc/stable/reference/generated/numpy.savez.html
2. Iris dataset training sample in Bunch format: https://scikit-learn.org/stable/auto_examples/svm/plot_iris_svc.html

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib import image as im
import numpy as np
import os
from sklearn.utils import Bunch

In [None]:
# --- Output ---------------------------------------------------------------
# Suffix of the exported archive; the total image count is prepended on save.
dataset_name = "_datasetC.npz"

# --- Input layout ---------------------------------------------------------
# One folder per label. The position of a name in this list becomes the
# numeric label stored in the dataset.
class_name = ["top", "middle", "bottom"]
# Number of indexed sub-folders scanned inside each class folder
# (indices 0 .. num_image - 1, i.e. 000 to 022 here). Missing ones are skipped.
num_image = 23

# --- Geometry -------------------------------------------------------------
# Edge length in pixels of the (square) source images.
sliding_window_size = 200
# Each image is resized to this percentage of its original size.
scale_percent = 25

In [108]:
# Build the dataset in three steps: collect the image paths of every class,
# load each image as a resized grayscale sample, then save everything as .npz.
# NOTE: resizeImage is defined in a separate cell, which must be run first.

# Edge length in pixels of one stored (square) sample. This uses the same
# "* scale_percent / 100" arithmetic as resizeImage so both sizes agree.
sample_size = int(sliding_window_size * scale_percent / 100)
# Weights used to turn the R, G, B channels into a single gray value.
rgb_weights = [0.2989, 0.5870, 0.1140]

# Step 1: gather the file paths of each class in a single directory scan.
class_files = []
for folder in class_name:
    file_paths = []
    for image_index in range(num_image):
        path = f'cropped/{folder}/200_100/{image_index:03}/' # adjust to the right path of your dataset
        if not os.path.exists(path):
            continue

        # os.listdir returns names in arbitrary order; sorting makes the
        # sample order in the exported file reproducible between runs.
        for file_name in sorted(os.listdir(path)):
            if file_name.endswith(".jpg"):  # adjust to the right format of your dataset
                file_paths.append(f'{path}{file_name}')
    class_files.append(file_paths)

image_count = [len(file_paths) for file_paths in class_files]
total_image_count = sum(image_count)

# Step 2: load the images straight into the pre-allocated arrays.
# A class without any image simply contributes no rows.
data = np.zeros([total_image_count, sample_size, sample_size], dtype=np.uint8)
labels = np.zeros(total_image_count, dtype=np.uint8)
count = 0  # index of the next free row in data / labels
for label, file_paths in enumerate(class_files):
    for file_path in file_paths:
        image = im.imread(file_path)

        # Convert colour images to grayscale. Images that are already
        # single-channel (2-D) are used as they are.
        if image.ndim == 3:
            image = np.dot(image[..., :3], rgb_weights)

        resized = resizeImage(image, scale_percent)
        if resized.shape != (sample_size, sample_size):
            raise ValueError(
                f'{file_path}: resized image has shape {resized.shape}, '
                f'expected {(sample_size, sample_size)}; check sliding_window_size'
            )

        # Fractional gray values are truncated when stored as uint8.
        data[count] = resized.astype(np.uint8)
        labels[count] = label
        count += 1

# Step 3: export. The file name is "<total image count><dataset_name>".
np.savez(str(total_image_count) + dataset_name, data=data, label=labels) # adjust to the right name of desired dataset name

In [101]:
import cv2
def resizeImage(image, scale_percent):
    """Return `image` resized to `scale_percent` percent of its size.

    Args:
        image: array whose first two axes are (rows, columns).
        scale_percent: target size as a percentage of the original size;
            each resulting dimension is truncated to an integer.

    Returns:
        The array produced by cv2.resize with INTER_AREA interpolation.
    """
    # The target size is passed to cv2.resize as (width, height),
    # i.e. columns first, then rows.
    target_dim = (
        int(image.shape[1] * scale_percent / 100),
        int(image.shape[0] * scale_percent / 100),
    )
    return cv2.resize(image, target_dim, interpolation=cv2.INTER_AREA)