In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
def load_images_and_labels(data_dir, img_size=(128, 128)):
    """
    Load images and their class labels from a directory tree.

    Every immediate subdirectory of ``data_dir`` is treated as one class, and
    its name is used as the label for each image it contains. Entries of
    ``data_dir`` that are not directories are ignored. Class folders and the
    files inside them are visited in sorted order so that the returned arrays
    are the same on every run and on every platform (``os.listdir`` alone
    gives no ordering guarantee).

    Files that cannot be read or resized are reported on stdout and skipped;
    they never abort the whole load.

    Args:
        data_dir (str): Path to the directory containing subdirectories for each class.
        img_size (tuple): Size to resize each image to (width, height).

    Returns:
        np.array: Array of resized images, exactly as produced by
            ``cv2.imread`` + ``cv2.resize`` (OpenCV's default channel order
            is BGR, not RGB).
        np.array: Array of labels (subdirectory names), aligned with the images.
    """
    images, labels = [], []
    for label in sorted(os.listdir(data_dir)):
        label_path = os.path.join(data_dir, label)
        if not os.path.isdir(label_path):
            continue
        for img_name in sorted(os.listdir(label_path)):
            img_path = os.path.join(label_path, img_name)
            try:
                img = cv2.imread(img_path)
                # cv2.imread does not raise on failure: it returns None for
                # missing, corrupt or non-image files. Check explicitly so the
                # message names the real cause instead of a resize assertion.
                if img is None:
                    print(f"Error loading image {img_path}: file could not be read as an image")
                    continue
                resized = cv2.resize(img, img_size)
            except Exception as e:
                # Best-effort loading: report the problem and keep going.
                print(f"Error loading image {img_path}: {e}")
                continue
            # Append image and label together so the two lists stay aligned.
            images.append(resized)
            labels.append(label)
    return np.array(images), np.array(labels)

In [3]:
def preprocess_and_save(data_dir, save_path, img_size=(128, 128), compress=False):
    """
    Preprocess images and labels, then save them to a single .npz file.

    Steps: load every image under ``data_dir`` (one subdirectory per class),
    scale pixel values by 1/255, integer-encode the class names with
    ``LabelEncoder`` and write three arrays to ``save_path``:

    * ``X``       - the scaled images (dividing by ``255.0`` yields floats),
    * ``y``       - the integer label of each image,
    * ``classes`` - the class names; ``classes[i]`` is the name for label ``i``.

    Args:
        data_dir (str): Path to the directory containing subdirectories for each class.
        save_path (str): Path to save the processed data (.npz file). Missing
            parent directories are created.
        img_size (tuple): Size to resize each image to (width, height).
        compress (bool): If True, write with ``np.savez_compressed``. The
            default (False) keeps the original behaviour of ``np.savez``,
            which stores the arrays uncompressed.

    Raises:
        ValueError: If no image could be loaded from ``data_dir``.
    """
    print(f"Loading data from {data_dir}...")
    X, y = load_images_and_labels(data_dir, img_size)
    print(f"Loaded {len(X)} images.")

    # Fail loudly instead of silently writing an empty, unusable dataset.
    if len(X) == 0:
        raise ValueError(f"No images could be loaded from {data_dir}; nothing to save.")

    # Normalize images to [0, 1] range
    X = X / 255.0

    # Encode labels to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # np.savez does not create directories; make sure the target folder exists.
    # Only done for path-like targets so open file objects still pass through.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(save_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save the processed data
    print(f"Saving processed data to {save_path}...")
    writer = np.savez_compressed if compress else np.savez
    writer(save_path, X=X, y=y, classes=encoder.classes_)
    print(f"Data saved successfully at {save_path}.")

In [4]:
# Raw input directories, one subfolder per class.
# All paths are relative to the notebook's working directory.
train_data_dir = "../data/raw/Training"
test_data_dir = "../data/raw/Testing"

# Output files for the preprocessed arrays (X, y, classes) of each split.
train_save_path = "../data/processed/training_data.npz"
test_save_path = "../data/processed/testing_data.npz"

# Preprocess and save the training split; the testing split is handled in the
# next cell so each split's log output stays separate.
preprocess_and_save(train_data_dir, train_save_path)

    

Loading data from ../data/raw/Training...
Loaded 5712 images.
Saving processed data to ../data/processed/training_data.npz...
Data saved successfully at ../data/processed/training_data.npz.


In [5]:
# Preprocess and save the testing split.
# Relies on test_data_dir / test_save_path defined in the previous cell.
preprocess_and_save(test_data_dir, test_save_path)

Loading data from ../data/raw/Testing...
Loaded 1311 images.
Saving processed data to ../data/processed/testing_data.npz...
Data saved successfully at ../data/processed/testing_data.npz.


In [1]:
import numpy as np
# Sanity check: reopen the saved training archive. For a .npz file np.load
# returns an archive object; each array is read from disk when it is indexed.
data = np.load('../data/processed/training_data.npz')
print(data.files)  # Names of the arrays stored in the .npz file

['X', 'y', 'classes']


In [2]:
# Class names saved from the LabelEncoder; the position of a name is the
# integer label that stands for it in data['y'].
print(data['classes'])

['glioma' 'meningioma' 'notumor' 'pituitary']


In [3]:
# Integer-encoded labels, one entry per image.
print(data['y'])

[0 0 0 ... 3 3 3]


In [None]:
# Scaled image array. NumPy abbreviates the printout with "...", so only the
# corner pixels are shown.
# NOTE(review): indexing data['X'] presumably reads the entire array into memory — confirm before running on a small machine.
print(data['X'])

[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 ...

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]

 [[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]
  ...
  [0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]]


In [None]:
import numpy as np
import h5py

def convert_npz_to_h5(npz_filename, h5_filename):
    """
    Copy every array stored in a NumPy .npz archive into a new HDF5 file.

    Each array is written as a dataset named after its key in the archive.
    Unicode string arrays (dtype kind ``'U'``) are UTF-8 encoded to
    fixed-width byte strings first, because h5py does not store NumPy unicode
    arrays directly; readers must ``.decode('utf-8')`` those values. Byte
    string and numeric arrays are written unchanged.

    Args:
        npz_filename (str): Path of the .npz archive to read.
        h5_filename (str): Path of the HDF5 file to create. An existing file
            at this path is overwritten (opened with mode ``'w'``).
    """
    # Both files are opened as context managers so the archive's file handle
    # is released as well as the HDF5 file, even if a dataset write fails.
    with np.load(npz_filename) as npz_data, h5py.File(h5_filename, 'w') as h5f:
        for name in npz_data.files:
            data = npz_data[name]

            if isinstance(data, np.ndarray) and data.dtype.kind == 'U':
                if data.size:
                    # A plain astype('S') encodes as ASCII and raises on any
                    # non-ASCII character; encode explicitly as UTF-8.
                    data = np.char.encode(data, 'utf-8')
                else:
                    # Nothing to encode; just switch to a byte-string dtype.
                    data = data.astype('S')

            # Write the dataset to the HDF5 file
            h5f.create_dataset(name, data=data)

# Example usage: convert the preprocessed training split to HDF5.
# The .npz input must already exist (it is written by preprocess_and_save).
convert_npz_to_h5('../data/processed/training_data.npz', '../data/processed/training_data.h5')


In [17]:
# Same conversion for the preprocessed testing split.
convert_npz_to_h5('../data/processed/testing_data.npz', '../data/processed/testing_data.h5')