In [None]:
import pandas as pd
import h5py
import numpy as np
from PIL import Image
import json

In [None]:
def dereference_data(file, obj):
    """
    Recursively dereferences data in an HDF5 file.

    Parameters
    - file: the open HDF5 file object; used to look up the target of references
    - obj: the object (dataset, group, reference, or array) to dereference

    Returns
    - dataset of object dtype: a list with one entry per item along the first
      axis; references are replaced by their dereferenced targets and row
      arrays (multi-dimensional datasets) become nested lists
    - any other dataset: its full contents as read by ``obj[()]``
    - group: a dict mapping each member name to its dereferenced value
    - reference: the dereferenced value of the object it points to
    - numpy array: a list in which references are dereferenced and every
      other element is kept as-is
    - anything else: the object itself, unchanged
    """
    if isinstance(obj, h5py.Dataset):
        # ``[()]`` reads the whole dataset in a single call and, unlike ``[:]``,
        # is also valid for scalar datasets.
        values = obj[()]

        if obj.dtype != 'O':  # not an object dtype: plain data, return it directly
            return values

        if not isinstance(values, np.ndarray):
            # Scalar object dataset: one reference (or one plain object).
            return dereference_data(file, values)

        # Each item is a reference (1-D dataset), a row array (N-D dataset) or a
        # plain object. The recursion dispatches on the type, so rows go through
        # the array branch below, which leaves non-reference elements untouched
        # instead of trying to look them up in the file.
        return [dereference_data(file, item) for item in values]

    elif isinstance(obj, h5py.Group):
        # If it's a group, recursively dereference each member
        return {key: dereference_data(file, item) for key, item in obj.items()}

    elif isinstance(obj, h5py.Reference):
        # If it's a single reference, dereference and fetch its data
        return dereference_data(file, file[obj])

    elif isinstance(obj, np.ndarray):
        # For numpy arrays, dereference the references and keep everything else
        return [
            dereference_data(file, file[ref]) if isinstance(ref, h5py.Reference) else ref
            for ref in obj
        ]

    else:
        return obj  # Return as-is if it's a basic data type or unhandled type

# Main function to open the .mat file and process one of its top-level groups
def load_mat_to_dict(filepath, group_name='digitStruct'):
    """
    Load one top-level group of an HDF5-based .mat file into a plain dict.

    - filepath: location of the .mat file; it is opened read-only
    - group_name: name of the top-level group to read (default: 'digitStruct')

    Returns a dict mapping each member name of the group to the value that
    ``dereference_data`` produces for that member.

    Raises KeyError if ``group_name`` is not present in the file.
    """
    with h5py.File(filepath, 'r') as mat_file:
        group = mat_file[group_name]
        # Dereferencing has to happen while the file is still open, because
        # references can only be followed through the open file object.
        return {key: dereference_data(mat_file, group[key]) for key in group}

# Usage example: load every member of the 'digitStruct' group from the training
# annotations file into a plain dict (member name -> dereferenced data).
train_data_dict = load_mat_to_dict('train/digitStruct.mat')

In [None]:
# Example: crop one rectangular region out of a single training image and preview it.
image = Image.open("train/1.png")
# Box handed to crop(); the last two values are written as origin plus extent.
# NOTE(review): presumably (left, top, left + width, top + height) copied from one
# bounding-box annotation — confirm against the digitStruct data.
crop_coords = (246, 77, 246+81, 77+219)
cropped_image = image.crop(crop_coords)
# Display the cropped region.
cropped_image.show()

In [None]:
# Rebuild the image names from digitStruct/name: references that point at uint16
# arrays are decoded into strings, everything else is stored exactly as read.
with h5py.File('train/digitStruct.mat', 'r') as mat_file:
    name_dataset = mat_file['digitStruct']['name']
    dereferenced_names = []

    # One entry per row of the dataset; only column 0 is consulted.
    for i in range(name_dataset.shape[0]):
        ref = name_dataset[i, 0]

        # Anything that is not a reference is kept untouched.
        if not isinstance(ref, h5py.Reference):
            dereferenced_names.append(ref)
            continue

        # Follow the reference and read the full contents of its target.
        actual_data = mat_file[ref][:]

        # Payloads that are not uint16 arrays are stored as they were read.
        if not isinstance(actual_data, np.ndarray) or actual_data.dtype != np.uint16:
            dereferenced_names.append(actual_data)
            continue

        # uint16 payload: treat each value as a character code and join them.
        string_data = ''.join(map(chr, actual_data.flatten()))
        dereferenced_names.append(string_data)

# Replace the generically dereferenced "name" entry with the decoded values.
train_data_dict["name"] = dereferenced_names
train_data_dict["name"]

In [None]:
# Inspect the first dereferenced "bbox" entry to see what a single record looks like.
test = train_data_dict["bbox"][0]
test

In [None]:
# Attach the name at position i to the first record of the i-th bbox entry.
for i, bbox_entry in enumerate(train_data_dict["bbox"]):
    # Make sure the record carries a "name" set before anything is added to it.
    bbox_entry[0].setdefault("name", set())

    # Entries past the end of the name list keep whatever their set already holds.
    if i >= len(train_data_dict["name"]):
        continue
    bbox_entry[0]["name"].add(train_data_dict["name"][i])

train_data_dict

In [None]:
# Remove the top-level "name" list. pop() with a default (rather than del) keeps
# this cell safe to re-run once the key is already gone.
train_data_dict.pop("name", None)
train_data_dict

In [27]:
def clean_and_convert(obj):
    """
    Recursively convert a nested structure into JSON-serializable builtins.

    - numpy arrays become (possibly nested) lists via ``tolist()``
    - numpy scalars (e.g. ``np.int64``, ``np.uint16``) become the matching
      Python scalar; ``json`` cannot encode most of them as they are
    - sets become lists (element order is whatever the set yields)
    - lists are cleaned element by element and flattened: any element that is
      a list after cleaning is spliced into the result
    - dicts keep their keys; their values are cleaned recursively
    - every other object is returned unchanged
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        # numpy scalar -> plain Python scalar
        return obj.item()
    if isinstance(obj, set):
        # Set members are hashable, so they can only be scalar-like; cleaning
        # them still matters for numpy scalars.
        return [clean_and_convert(member) for member in obj]
    if isinstance(obj, list):
        flattened = []
        for element in obj:
            cleaned_element = clean_and_convert(element)
            if isinstance(cleaned_element, list):
                flattened.extend(cleaned_element)  # Flatten nested lists
            else:
                flattened.append(cleaned_element)
        return flattened
    if isinstance(obj, dict):
        return {key: clean_and_convert(value) for key, value in obj.items()}
    return obj

# Clean and serialize the data: convert NumPy arrays and sets into plain lists
# (nested lists are flattened) so the structure can be encoded as JSON.
cleaned_data = clean_and_convert(train_data_dict)

# Write the converted structure to train_data.json in the working directory.
with open("train_data.json", "w") as file:
    json.dump(cleaned_data, file)