In [17]:
# Symlink the downloaded dataset into the current working directory so that the
# relative path 'indoor_CVPR_dataset' used by the code below resolves.
# NOTE(review): the source path is an absolute, machine-specific location — adjust it per machine.
!ln -s /Users/giakhang/Downloads/indoor_CVPR_dataset .

In [28]:
import numpy as np
import progressbar
import imutils
import random
import cv2
import os
from pathlib import Path

In [42]:
def create_dataset(num_images=10000):
    """Build the rotated-image dataset on disk.

    Walks ``indoor_CVPR_dataset/images``, shuffles the file list and keeps at
    most ``num_images`` entries. Every file that OpenCV can read is rotated by
    an angle drawn at random from {0, 90, 180, 270} and written to
    ``indoor_CVPR_dataset/rotated_images/<angle>/image_<NNNNN><ext>``, where
    ``NNNNN`` is a zero-padded per-angle counter and ``<ext>`` is the source
    file's extension. Finally prints the number of images written per angle.

    Args:
        num_images: Maximum number of source files to sample. ``None`` keeps
            every file found. Defaults to 10000.
    """
    DATASET_BASE = Path('indoor_CVPR_dataset')
    output_root = DATASET_BASE / 'rotated_images'

    # 1. Get file paths
    image_paths = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(DATASET_BASE / 'images')
        for file in files
    ]

    # 2. Randomly select at most `num_images` files
    random.shuffle(image_paths)
    image_paths = image_paths[:num_images]

    # 3. Rotate images
    # {rotated_angle: num_of_imgs, ...}
    angles = {0: 0, 90: 0, 180: 0, 270: 0}
    angle_choices = list(angles.keys())

    # Start progress bar
    bar = progressbar.ProgressBar(maxval=len(image_paths))
    bar.start()

    for i, image_path in enumerate(image_paths):
        image = cv2.imread(image_path)

        # cv2.imread returns None for files it cannot decode; skip those but
        # still advance the progress bar so it keeps tracking the loop index.
        if image is None:
            bar.update(i)
            continue

        # Pick the rotation only for readable images; cast to a plain int so
        # it is used as-is for the dict key and the directory name.
        angle = int(np.random.choice(angle_choices))
        image = imutils.rotate_bound(image, angle)

        # Output directory for this angle (exist_ok avoids a check-then-create race)
        base = os.path.join(str(output_root), str(angle))
        os.makedirs(base, exist_ok=True)

        # splitext only inspects the final path component, so a dot in a
        # directory name is never mistaken for the extension. Fall back to
        # '.jpg' when the source file has no extension, since the output
        # name needs one.
        ext = os.path.splitext(image_path)[1] or '.jpg'
        file_name = "image_{}{}".format(str(angles[angle]).zfill(5), ext)
        output_path = os.path.join(base, file_name)

        # Only count the image when the write is reported as successful, so
        # the printed totals match what was written.
        if cv2.imwrite(output_path, image):
            angles[angle] += 1
        bar.update(i)

    bar.finish()

    for angle, count in angles.items():
        print('Số lượng ảnh góc {}: {}'.format(angle, count))

In [43]:
# Generate the rotated images under indoor_CVPR_dataset/rotated_images/<angle>/.
create_dataset()

100% |#########################################################################|

Số lượng ảnh góc 0: 2427
Số lượng ảnh góc 90: 2525
Số lượng ảnh góc 180: 2491
Số lượng ảnh góc 270: 2536





In [3]:
# import the necessary packages
import h5py
import os


class HDF5DatasetWriter:
    '''
    Buffered writer that stores rows (images or feature vectors) together with
    their integer labels in an HDF5 file.

    Rows handed to add() are accumulated in memory and written to the file once
    at least bufSize rows are buffered; close() writes whatever is left and
    closes the file.
    '''

    def __init__(self, dims, outputPath, dataKey='images', bufSize=1000):
        '''
        Create the HDF5 file and its two datasets.

        dims: shape of the data dataset; dims[0] is the number of rows and is
              also used as the length of the 'labels' dataset
        outputPath: path of the HDF5 file to create
        dataKey: name of the dataset that holds the rows
        bufSize: number of buffered rows that triggers a flush to disk

        Raises ValueError if outputPath already exists.
        '''
        # refuse to overwrite an existing file
        if os.path.exists(outputPath):
            raise ValueError("The supplied 'output path' already exists and cannot be overwritten. Manually delete the file before contuinuing", outputPath)

        # open the HDF5 database for writing and create two datasets: one to
        # store the images/features (named by dataKey) and another to store
        # the class labels
        self.db = h5py.File(outputPath, 'w')
        self.data = self.db.create_dataset(dataKey, dims, dtype=float)
        self.labels = self.db.create_dataset('labels', (dims[0],), dtype=int)

        # store the buffer size, then initialize the buffer itself along with
        # the index of the next free row in the datasets
        self.bufSize = bufSize
        self.buffer = {'data': [], 'labels': []}
        self.idx = 0

    def add(self, rows, labels):
        '''
        Append rows and their labels to the in-memory buffer, flushing to disk
        once the buffer holds at least bufSize rows.
        '''
        self.buffer['data'].extend(rows)
        self.buffer['labels'].extend(labels)

        # check to see if the buffer needs to be flushed to disk
        if len(self.buffer['data']) >= self.bufSize:
            self.flush()

    def flush(self):
        '''
        Write the buffered rows and labels to the datasets, advance the write
        index, then reset the buffer. Does nothing when the buffer is empty.
        '''
        pending = len(self.buffer['data'])
        if pending == 0:
            return

        end = self.idx + pending
        self.data[self.idx:end] = self.buffer['data']
        self.labels[self.idx:end] = self.buffer['labels']
        self.idx = end
        self.buffer = {'data': [], 'labels': []}

    def storeClassLabels(self, classLabels):
        '''
        Create the 'label_names' dataset and store the class label names in it.

        The dataset uses a variable-length string dtype, so every label is
        converted to str first (bytes are decoded as UTF-8); this lets numeric
        class labels be stored as well.
        '''
        names = [
            label.decode('utf-8') if isinstance(label, bytes) else str(label)
            for label in classLabels
        ]
        dt = h5py.special_dtype(vlen=str)
        labelSet = self.db.create_dataset('label_names', (len(names),), dtype=dt)
        labelSet[:] = names

    def close(self):
        '''
        Flush any entries still in the buffer to disk, then close the file.
        '''
        if len(self.buffer['data']) > 0:
            self.flush()

        # close the dataset
        self.db.close()

In [43]:
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np
import progressbar
import random
import os
from pathlib import Path

def feature_extraction(model=None, batch_size=32):
    """Extract features from the rotated images and store them in an HDF5 file.

    Collects every .jpg/.jpeg/.png file under
    ``indoor_CVPR_dataset/rotated_images``, using the name of the directory that
    contains each file (the rotation angle) as its label. Images are pushed
    through ``model`` in batches and the flattened outputs are written, with
    the encoded labels, to ``indoor_CVPR_dataset/hdf5/features.hdf5`` via
    ``HDF5DatasetWriter``.

    NOTE(review): ``load_img``, ``img_to_array`` and ``imagenet_utils`` are not
    imported in this cell; presumably they are the Keras image helpers and must
    already be in scope — confirm.

    Args:
        model: Object with a ``predict(batch)`` method whose output flattens to
            ``512 * 7 * 7`` values per image (the size the HDF5 dataset is
            created with) — presumably VGG16 without its top layers, per the
            original "Load VGG16" step. Required.
        batch_size: Number of images per ``predict`` call. Defaults to 32.

    Raises:
        ValueError: If ``model`` is None or no rotated images are found.
    """
    # The network is supplied by the caller; fail before touching the disk if
    # it is missing.
    if model is None:
        raise ValueError(
            "feature_extraction() needs a feature extractor: pass model=<object with "
            "predict()>, e.g. a VGG16 network loaded without its top layers"
        )

    DATASET_BASE = Path('indoor_CVPR_dataset')
    image_extensions = {'.jpg', '.jpeg', '.png'}
    feature_dim = 512 * 7 * 7

    # 1. Get rotated images' paths and their labels
    image_paths, labels = [], []
    rotated_images = DATASET_BASE / 'rotated_images'

    for root, _dirs, files in os.walk(rotated_images):
        for file in files:
            # Compare the real extension, case-insensitively
            if os.path.splitext(file)[1].lower() in image_extensions:
                image_paths.append(os.path.join(root, file))

                # The parent directory name is the rotation angle;
                # basename works with any path separator.
                labels.append(int(os.path.basename(root)))

    if not image_paths:
        raise ValueError("No rotated images found under {}".format(rotated_images))

    # 2. Encode label to number
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    # 3. Create HDF5 database (make sure its directory exists first)
    hdf5_file = DATASET_BASE / 'hdf5' / 'features.hdf5'
    hdf5_file.parent.mkdir(parents=True, exist_ok=True)
    dataset = HDF5DatasetWriter((len(image_paths), feature_dim), hdf5_file, dataKey="features", bufSize=1000)

    try:
        # 4. Save label names to dataset (as strings, since the classes are integers)
        dataset.storeClassLabels([str(name) for name in le.classes_])

        bar = progressbar.ProgressBar(maxval=len(image_paths))
        bar.start()

        # 5. Iterate over batches
        for i in range(0, len(image_paths), batch_size):
            # Get image paths and their labels
            batch_paths = image_paths[i:i + batch_size]
            batch_labels = labels[i:i + batch_size]

            batch_images = []
            for image_path in batch_paths:
                # Load every image at the same size so the batch can be stacked
                image = load_img(image_path, target_size=(224, 224))
                image = img_to_array(image)

                # Preprocess image
                image = np.expand_dims(image, 0)
                image = imagenet_utils.preprocess_input(image)

                batch_images.append(image)

            batch_images = np.vstack(batch_images)
            features = model.predict(batch_images)

            # Flatten each image's output to one row; reshape raises if the
            # model output does not match the dataset width.
            features = features.reshape((features.shape[0], feature_dim))
            dataset.add(features, batch_labels)
            bar.update(i)

        bar.finish()
    finally:
        # Always flush what was buffered and release the HDF5 file handle
        dataset.close()

        

In [44]:
# Run the feature-extraction step on the rotated images.
feature_extraction()

[90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90, 90]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1]
[  0  90 180 270]
