In [23]:
import os
import h5py
import numpy as np
import cv2
import pandas as pd
import random

In [24]:
def process_image(image, target_size):
    h, w = image.shape[:2]
    boundary = max(h, w)
    scale = target_size / boundary

    """
    new_h = int(h * scale)
    new_w = int(w * scale)
    resized_img = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
    canvas = np.zeros((target_size, target_size, 3), dtype=np.uint8)

    x_offset = (target_size - new_h) // 2
    y_offset = (target_size - new_w) // 2
    canvas[x_offset:x_offset + new_h, y_offset:y_offset + new_w] = resized_img
    """
    
    canvas = cv2.resize(image, (target_size, target_size))
    canvas = np.transpose(canvas, (2, 0, 1))
    return canvas

In [25]:
def create_h5_dataset(loader, class_names, output_path):
    X = []
    Y = []
    for n, item in enumerate(loader):
        X.append(item[0])
        Y.append(item[1])
        
    if os.path.exists(output_path):
        os.remove(output_path)
        print(f"Remove existing {output_path} dataset")
    
    with h5py.File(output_path, 'w') as hf:
        hf.create_dataset("images", data=X)
        hf.create_dataset("labels", data=Y)
        hf.create_dataset("class_names", data=class_names)
    
    print("Dataset created successfully.")
    
    with h5py.File(output_path, 'r') as hf:
        labels = hf['labels'][:]
        images = hf['images'][:]
        class_names = hf['class_names'][:]

In [26]:
source_dir = 'campus_images'
images = []
labels = []

counter = 1
for root, dirs, files in os.walk(source_dir): 
    for file in files:
        if file.lower().endswith('.jpg'):
            file_path = os.path.join(root, file)
            label = os.path.basename(root)

            img = cv2.imread(file_path)
            if img is None:
                print(f"Could not read {file_path}.")
                continue

            if label == '.ipynb_checkpoints':
                continue

            print(f"Processing image No.{counter}: {file_path}")
            new_img = process_image(img, target_size=227)

            images.append(new_img)
            labels.append(label)
            counter += 1

Processing image No.1: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_155.jpg
Processing image No.2: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_169.JPG
Processing image No.3: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_196.jpg
Processing image No.4: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_357.jpg
Processing image No.5: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_26.jpg
Processing image No.6: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_27.jpg
Processing image No.7: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_356.jpg
Processing image No.8: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_197.jpg
Processing image No.9: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_183.JPG
Processing image No.10: campus_images/Lise-Meitner-Str-9_9377/Indoor/9377_EG/HW_709/HW_709_168.JPG
Processing image No.1

In [27]:
example = np.transpose(images[0], (1, 2, 0))
status = cv2.imwrite(os.path.join(os.getcwd(), 'example.jpg'), example)
print("Image written to file-system : ", status)

Image written to file-system :  True


In [28]:
len(images) == len(labels)

True

In [29]:
df = pd.DataFrame({'Room': labels})
one_hot = pd.get_dummies(df['Room'], dtype=int)
X = np.array(images, dtype=np.uint8)
Y = one_hot.to_numpy()
class_names = one_hot.columns.to_numpy(dtype='S')
print(f"X shape: {X.shape}  Y shape: {Y.shape}  Name shape: {class_names.shape}")

X shape: (3145, 3, 227, 227)  Y shape: (3145, 23)  Name shape: (23,)


In [30]:
X[0].shape

(3, 227, 227)

In [31]:
data_loader = []
for i in range(Y.shape[0]):
    data_loader.append((X[i], Y[i]))

print(f"Dataset length: {len(data_loader)}")

random.seed(0)
random.shuffle(data_loader)

train_data_loader = data_loader[:2800]
test_data_loader = data_loader[2800:]

Dataset length: 3145


In [32]:
create_h5_dataset(train_data_loader, class_names, 'train_dataset_nopad.h5')
create_h5_dataset(test_data_loader, class_names, 'test_dataset_nopad.h5')

Dataset created successfully.
Dataset created successfully.


In [33]:
import h5py
import numpy as np

with h5py.File('test_dataset_nopad.h5', 'r') as hf:
    print(hf.keys())
    labels = hf['labels'][:]

<KeysViewHDF5 ['class_names', 'images', 'labels']>


In [34]:
labels.shape

(345, 23)

In [35]:
labels.sum(axis=0)

array([18, 19, 13, 25, 11,  9,  8, 17, 14, 18, 18, 18,  9,  8,  5, 20, 33,
        8, 17, 11, 11, 20, 15])