# Data Processing

This workbook contains the code to dereference the HDF5 files, extract the correct file names, and create a dictionary with the digit locations within each image and the corresponding file name.

In [None]:
import pandas as pd
import h5py
import numpy as np
from PIL import Image
import json

In [None]:
def dereference_data(file, obj):
    """
    Recursively dereferences data in an HDF5 file.
    - file: the open HDF5 file object, used to resolve references
    - obj: the object (dataset, group, reference, or NumPy array) to dereference

    Returns:
    - Dataset with dtype 'O': a list with one entry per item obtained by
      iterating the dataset, each entry dereferenced recursively (a scalar
      dataset yields the dereferenced single value instead of a list).
    - Any other dataset: its full content as read by obj[()].
    - Group: a dict mapping each member name to its dereferenced content.
    - Reference: the dereferenced content of the object it points to.
    - NumPy array: a list in which every reference element is replaced by its
      dereferenced content and every other element is kept unchanged.
    - Anything else: the object itself, unchanged.
    """
    if isinstance(obj, h5py.Dataset):
        # dtype 'O' marks datasets whose elements are Python objects
        # (references, but also e.g. variable-length strings)
        if obj.dtype == 'O':
            # A scalar dataset cannot be iterated; read its single value instead
            if obj.shape == ():
                return dereference_data(file, obj[()])
            # Each element is dispatched through this function again, so a
            # reference is followed, a row (NumPy array) is handled element by
            # element, and anything else is kept as-is. Non-reference values
            # inside a row are therefore never used as a lookup key on `file`.
            return [dereference_data(file, element) for element in obj]
        # No references: return the content. [()] reads the whole dataset and,
        # unlike [:], also works for scalar datasets.
        return obj[()]

    # If the object is an HDF5 group, recursively process its members
    if isinstance(obj, h5py.Group):
        return {key: dereference_data(file, item) for key, item in obj.items()}

    # If it's a single reference, dereference and fetch its data
    if isinstance(obj, h5py.Reference):
        return dereference_data(file, file[obj])

    # If the object is a NumPy array, dereference the references it contains
    # and keep every other element unchanged
    if isinstance(obj, np.ndarray):
        return [
            dereference_data(file, file[ref]) if isinstance(ref, h5py.Reference) else ref
            for ref in obj
        ]

    # Return as-is if it's a basic data type or unhandled type
    return obj

# Main function to open the .mat file and process the digitStruct group
def load_mat_to_dict(filepath):
    """
    Reads the 'digitStruct' group of a .mat file (HDF5 format) into a Python dictionary.
    - filepath: path to the .mat file
    Returns:
    - A dictionary with one entry per key of 'digitStruct'; each value is the
      result of dereference_data for that member.
    """
    with h5py.File(filepath, 'r') as mat_file:
        digit_struct = mat_file['digitStruct']
        # Everything is dereferenced while the file is still open; the
        # resulting dictionary is handed back as the file closes.
        return {
            key: dereference_data(mat_file, digit_struct[key])
            for key in digit_struct
        }

# Load the .mat file into a dictionary (replace the placeholder with the real path)
train_data_dict = load_mat_to_dict(filepath='local file path')

In [None]:
# Example: cut a single digit out of an image.
# The crop box passed to PIL is (left, upper, right, lower), so the right and
# lower edges are derived from the left/top corner plus the box size.
left, top = 246, 77
width, height = 81, 219
crop_coords = (left, top, left + width, top + height)

image = Image.open("train/1.png")
cropped_image = image.crop(crop_coords)
cropped_image.show()

In [None]:
with h5py.File('local file path', 'r') as mat_file:
    # The 'name' dataset inside 'digitStruct' is read one row at a time
    name_dataset = mat_file['digitStruct']['name']

    # Collected names (strings where the target could be decoded)
    dereferenced_names = []

    for row in range(name_dataset.shape[0]):
        entry = name_dataset[row, 0]

        # Anything that is not a reference is stored unchanged
        if not isinstance(entry, h5py.Reference):
            dereferenced_names.append(entry)
            continue

        # Follow the reference and read the data it points to
        target = mat_file[entry][:]

        # A uint16 array is treated as a sequence of character codes and
        # joined into a string; any other data is stored as read
        if isinstance(target, np.ndarray) and target.dtype == np.uint16:
            target = ''.join(map(chr, target.flatten()))
        dereferenced_names.append(target)

# Store the names in the dictionary under "name", then display them
train_data_dict["name"] = dereferenced_names
train_data_dict["name"]

In [None]:
# Attach each name to the bbox entry at the same position
image_names = train_data_dict["name"]
for index, bbox_entry in enumerate(train_data_dict["bbox"]):
    # Make sure the entry has a "name" set, creating an empty one if needed
    name_set = bbox_entry[0].setdefault("name", set())

    # Entries beyond the end of the name list keep whatever set they have
    if index < len(image_names):
        name_set.add(image_names[index])

train_data_dict

In [None]:
# Remove the additional top-level "name" key from the dictionary
train_data_dict.pop("name")
train_data_dict

In [None]:
def clean_and_convert(obj):
    """
    Cleans and converts data structures into a standardized format:
    - Converts NumPy arrays to Python lists (via tolist(), so a
      multi-dimensional array keeps its own nesting).
    - Converts NumPy scalars (e.g. np.int64) to the equivalent Python scalar.
    - Converts sets to lists, cleaning each element.
    - Flattens lists: any element that cleans to a list is spliced into its
      parent list instead of being nested inside it.
    - Recursively processes dictionary values; keys are left unchanged.

    Args:
    - obj: The input object to clean and convert.

    Returns:
    - A cleaned and converted version of the object. Objects of any other
      type are returned unchanged.
    """
    # Convert NumPy arrays to (possibly nested) lists of Python scalars
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # NumPy scalars such as np.int64 are not JSON serializable; unwrap them
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, dict):
        return {key: clean_and_convert(value) for key, value in obj.items()}
    # Convert sets to lists, cleaning the elements on the way
    if isinstance(obj, set):
        return [clean_and_convert(element) for element in obj]
    if isinstance(obj, list):
        flattened = []
        for element in obj:
            cleaned_element = clean_and_convert(element)
            if isinstance(cleaned_element, list):
                # Splice the cleaned list into the parent instead of nesting it
                flattened.extend(cleaned_element)
            else:
                flattened.append(cleaned_element)
        return flattened
    return obj

# Convert the dictionary into plain, JSON-serializable Python structures
cleaned_data = clean_and_convert(train_data_dict)

# Write the cleaned dictionary to a JSON file for later use
# NOTE(review): the data comes from train_data_dict but is written to
# "test_data.json" - confirm the file name is intended.
with open("test_data.json", "w") as json_file:
    json.dump(cleaned_data, json_file)