In [243]:
from typing import List, Tuple, Dict, Optional, List
from datetime import time
import pandas as pd
import numpy as np
import zarr
import sys
import os

## Merge Function

Implement a function called `merge_zarr_datasets` that takes a list of datasets and merges them according to the following specifications

1. Each dataset is represented as a dictionary where:
    - The keys are `array_type` strings
    - The values are tuples containing (zarr_array, feature_list)
    - All `array_types` within a dataset have the same number of rows and the same row order.
2. The function should merge datasets by combining arrays of the same `array_type` across all datasets.
3. If an `array_type` is missing in one or more datasets, it should be generated on the fly with zero values, matching the shape of existing arrays of that type in the dataset.
4. All arrays contain int64 values.
5. For each `array_type`, create a union of all features from all datasets. The merged array should include all these features, filling with zeros where necessary.
6. The rows from all datasets should be combined and randomized in the output, not simply concatenated serially. The randomization should maintain the relationship between rows across all `array_types`.
7. The function should return a merged dataset in the same format as the input datasets.
8. The solution should be optimized to handle up to 5 million rows in the merged dataset, potentially merging hundreds of datasets, each with 2-3 `array_types` and 50 to 50,000 features in each `array_type`.

## Solution

The solution is implemented in the `merge_zarr_datasets` function, which takes a list of datasets and returns a merged dataset according to the specifications provided.

The function works as follows:
1. It first determines the union of all `array_types` and features across all datasets.
2. It then creates a merged dataset with zero values for all `array_types` and features.
3. It iterates over each dataset and fills the merged dataset with values from the current dataset.
4. It randomizes the rows of the merged dataset to maintain the relationship between rows across all `array_types`.

Data:
- Features up to $50,000$
- Rows up to $5,000,000$
- Max matrix = $50,000 * 5,000,000 = 250,000,000,000$

Size:
- Each int64 value is $8$ bytes
- Max size = $250,000,000,000 * 8 = 2,000,000,000,000$ bytes
- $= 2,000,000,000,000 / 1024 = 1,953,125,000$ KB
- $= 1,953,125,000 / 1024 = 1,907,348$ MB
- $= 1,907,348 / 1024 = 1,863$ GB
- $= 1,863 / 1024 = 1.82$ TB



A single row with $50,000$ features would be $400,000$ bytes or $390.625$ KB. This is a reasonable size for a single row. A reasonable number of rows in a chunk would be $10,000$ rows, which would be $3.9$ GB. This is a reasonable size for a chunk.

But Blosc compressor used by Zarr has a buffer size limit of 2 GB. So, we need to keep the chunk size below 2 GB. Reducing the row per chunk to $5,000$ rows would make the chunk size $1.95$ GB, which is below the 2 GB limit.

For chunks of size $2,500$ rows, the chunk size would be $976.5625$ MB, which is also below the 2 GB limit.

In [244]:
dataset1 = {
 "type_A": (zarr.array([[1, 2, 0], [3, 4, 0]]), ["feature1", "feature2", "feature3"]),
 "type_B": (zarr.array([[5, 6], [7, 8]]), ["feature4", "feature5"])
}
dataset2 = {
 "type_A": (zarr.array([[7, 8, 9], [10, 11, 12]]), ["feature1", "feature2", "feature4"]),
 "type_C": (zarr.array([[10, 11], [12, 13]]), ["feature6", "feature7"])
}
datasets = [dataset1, dataset2]

In [245]:
def check_available_name() -> str:
    '''
    Check the available name for the new merged array
    Args:
        None
    Returns:
        str: the name of the new merged array
    '''
    folders = os.listdir()
    # check folder name starting with merged_arrays
    folders = [folder for folder in folders if folder.startswith("merged_arrays")]
    folders.sort()
    # return the next name
    return f"merged_arrays_{len(folders)+1}.zarr"

In [246]:
def print_func(merged_arrays: Dict[str, Tuple[zarr.Array, List[str]]]):
    for key, val in merged_arrays.items():
        features = val[1]
        print(f'Array Type: {key}')
        # print features
        for feat in features:
            print(feat, end='\t')
        print()
        # print data
        for row in val[0]:
            for col in row:
                print(col,' '*(10-len(str(col))), end=' '*5)
            print()
        print()

In [247]:
def permute_in_chunks(size:int, chunk_size:int, seed:int=42) -> List[int]:
    '''
    Permutate an array in chunks of a given size.
    Args:
        size: The size of the array to be permuted
        chunk_size: The size of the chunks to permute
    Returns:
        A permuted array of the given size
    Examples:
    >>> permute_in_chunks(10, 3)
    array([2, 0, 1, 5, 4, 3, 9, 7, 8, 6])
    '''
    np.random.seed(seed)
    arr = np.arange(size)
    permuted_arr = np.empty(size, dtype=int)
    
    for start in range(0, size, chunk_size):
        end = start + chunk_size
        permuted_arr[start:end] = np.random.permutation(arr[start:end])

    return list(permuted_arr)

In [248]:
res = permute_in_chunks(5000000, 1000)
size = sys.getsizeof(res)
print(f'Size in KB: {size/1024}')
print(f'Size in MB: {size/1024**2}')

Size in KB: 39062.5546875
Size in MB: 38.14702606201172


In [249]:
def make_zarr_group(total_rows:int, all_array_types:List[str], feature_mappings:Dict[str, List[str]], seed:int=42) -> zarr.Group:
    '''
    Create a zarr group with the given parameters.
    Args:
        total_rows: The total number of rows in the group
        all_array_types: A list of all array types in the group
        feature_mappings: A dictionary mapping array types to their features
        seed: The seed for the random number generator
    Returns:
        group: A zarr group with the given parameters
    '''
    group = zarr.group(store=zarr.DirectoryStore(f'{check_available_name()}'))
    group.attrs['total_rows'] = total_rows
    group.attrs['all_array_types'] = list(all_array_types)
    group.attrs['feature_mappings'] = feature_mappings
    group.attrs['seed'] = seed
    group.attrs['datetime'] = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
    return group

In [250]:
def case_of_one(dataset:Dict[str, Tuple[zarr.Array, List[str]]]) -> Dict[str, Tuple[zarr.Array, List[str]]]:
    '''
    Return the dataset with randomised rows and features.
    Args:
        dataset: A dictionary of arrays to merge
    Returns:
        merged_array: A dictionary containing the merged array
    '''
    total_rows = dataset[list(dataset.keys())[0]][0].shape[0]
    shuffled_rows = permute_in_chunks(total_rows, 1000)
    row_hash = {array_type: shuffled_rows.copy() for array_type in dataset.keys()}
    feature_mappings = {}

    for array_type, features in dataset.items():
        permuted_features = np.random.permutation(features[1])
        feature_mappings[array_type] = {feature: idx for idx, feature in enumerate(permuted_features)}

    merged_arrays = {}
    merged_arrays_zarr = make_zarr_group(total_rows, list(dataset.keys()), feature_mappings)
    for array_type in dataset.keys():
        merged_arrays[array_type] = zarr.zeros(shape=(total_rows, len(feature_mappings[array_type])), dtype=np.int64, chunks=(10000, len(feature_mappings[array_type])), store=f'{array_type}.zarr', overwrite=True, default_fill_value=0)
        merged_arrays[array_type].attrs['features'] = list(feature_mappings[array_type])

        source_array, source_features = dataset[array_type]
        for row in range(total_rows):
            select_row = row_hash[array_type].pop(0)
            for feature in source_features:
                val = source_array.vindex[row, source_features.index(feature)]
                merged_arrays[array_type].vindex[select_row, feature_mappings[array_type][feature]] = val

    return_merged_array = {array_type: (merged_arrays[array_type], list(feature_mappings[array_type])) for array_type in dataset.keys()}
    return return_merged_array

In [251]:
def merge_zarr_datasets(datasets: List[Dict[str, Tuple[zarr.Array, List[str]]]]) -> Dict[str, Tuple[zarr.Array, List[str]]]:
    seed = 42
    np.random.seed(seed)
    
    if not datasets: # Case for empty datasets
        raise ValueError("No datasets provided")
    if len(datasets) == 1: # Case for one dataset -- no need to merge, return a randomised dataset 
        return_merged_array = case_of_one(datasets[0])
        return return_merged_array

    # Part 1: Collect all array types, their features, and calculate total rows
    all_array_types = set()
    feature_sets = {}
    total_rows = 0
    for dataset in datasets:
        # All `array_types` within a dataset have the same number of rows and the same row order. Loading the actual data can be expensive, so we only load the first dataset. Possible: Create an iterator to avoid loading all datasets. total_rows += next(iter(dataset.values()))[0].shape[0]
        total_rows += dataset[list(dataset.keys())[0]][0].shape[0]
        for array_type, (array, features) in dataset.items(): # Make a single pass through the dataset to collect all array types and features
            all_array_types.add(array_type)
            if array_type not in feature_sets:
                feature_sets[array_type] = set()
            feature_sets[array_type].update(features) # Update a set with the union of itself and others
    print('Total Rows:', total_rows)
    # Part 2: Create a Feature Mapping (Hash Table) for each array type
    feature_mappings = {} # This will be useful when we need to map features to their respective columns
    for array_type, features in feature_sets.items():
        permuted_features = np.random.permutation(list(features)) # Randomly permute the features
        feature_mappings[array_type] = {feature: idx for idx, feature in enumerate(permuted_features)}
    
    print(feature_mappings)

    # Part 3: Create a new dataset with the merged data
    merged_arrays = {}
    merged_arrays_zarr = make_zarr_group(total_rows, all_array_types, feature_mappings)

    for array_type in all_array_types:
        num_features = len(feature_sets[array_type])
        # A new merged array for each array type
        merged_arrays[array_type] = merged_arrays_zarr.zeros(shape=(total_rows, num_features), dtype=np.int64, name=f'{array_type}.zarr', overwrite=True)#, chunks=(2500, num_features)
        merged_arrays[array_type].attrs['features'] = list(feature_sets[array_type])

    # Part 4: Merge the data
    # We will iterate through each dataset and merge the data into the new dataset
    # We will create a hash for each row which is filled -- this will help us when filling the merged dataset randomly
    # permutate in chunks of 1000
    shuffled_rows = permute_in_chunks(total_rows, 2500)
    row_hash = {arr_type: shuffled_rows.copy() for arr_type in all_array_types} #Explicitly copy the list to avoid reference issues & Maintain randomization across all rows

    for dataset in datasets:
        print("Starting new dataset")
        in_rows = dataset[list(dataset.keys())[0]][0].shape[0] # Number of rows in current dataset
        # print(in_rows)
        for array_type in all_array_types:
            # print('\tRunning for array type:', array_type)
            if array_type == 'type_C' or array_type == 'type_B':
                continue
            if array_type in dataset.keys():
                source_array, source_features = dataset[array_type]
                # Map the features to their respective columns
                for rows in range(in_rows):
                    # print(f'\t\tRunning for row: {rows}/{in_rows}',end='\r')
                    # get the row number to be filled in the merged dataset -- we are ensuring that we fill the merged dataset randomly but in optimal fashion -- one chunk at a time.
                    select_row = row_hash[array_type].pop(0)
                    
                    # Order source features based on the feature_sets[array_type]
                    source_features_ordered = [feature_mappings[array_type][feature] for feature in source_features]
                    # print('\t\t',source_features)
                    source_features_data = source_array[rows, :]
                    # print('\t\t',source_features_data)
                    # print('\t\t',feature_mappings[array_type])
                    # print('\t\t',source_features_ordered)
                    
                    feed = zarr.zeros(shape=(len(feature_mappings[array_type])), dtype=np.int64)
                    feed[source_features_ordered] = source_features_data
                    # print('\t\tFeed:',list(feed))
                    # print('\t\tRow:',select_row)

                    merged_arrays[array_type][select_row, :] = feed
                    # print('\t\tMerge:',list(merged_arrays[array_type]))
                    # print()

                    # for feature in source_features:
                    #     # Not Optimized -- Loaded everytime and stored everytime -- figure out bulk assignment
                    #     merged_arrays[array_type].vindex[select_row, feature_mappings[array_type][feature]] = source_array.vindex[rows, source_features.index(feature)]
                        # merged_arrays[array_type][select_row, feature_mappings[array_type][feature]] = source_array[rows, source_features.index(feature)]
                        # Bulk assignment
                        # merged_arrays[array_type][select_row, feature_mappings[array_type][feature]] = source_array[rows, source_features.index(feature)]

    return_merged_array = {}
    for array_type in all_array_types:
        return_merged_array[array_type] = (merged_arrays[array_type], list(feature_mappings[array_type]))
    return return_merged_array

In [252]:
%time merged_arrays = merge_zarr_datasets(datasets)

Total Rows: 4
{'type_A': {'feature4': 0, 'feature2': 1, 'feature1': 2, 'feature3': 3}, 'type_B': {'feature4': 0, 'feature5': 1}, 'type_C': {'feature6': 0, 'feature7': 1}}
Starting new dataset
Starting new dataset
CPU times: user 8.22 ms, sys: 17.7 ms, total: 25.9 ms
Wall time: 75.5 ms


In [253]:
print_func(merged_arrays)

Array Type: type_C
feature6	feature7	
0               0               
0               0               
0               0               
0               0               

Array Type: type_A
feature4	feature2	feature1	feature3	
9               8               7               0               
0               2               1               0               
12              11              10              0               
0               4               3               0               

Array Type: type_B
feature4	feature5	
0               0               
0               0               
0               0               
0               0               



In [213]:
def verify_randomization(original_datasets: List[Dict[str, Tuple[zarr.Array, List[str]]]], merged_dataset: Dict[str, Tuple[zarr.Array, List[str]]]) -> bool:
    '''
    Verify that the merged dataset is a randomization of the original datasets.
    Args:
        original_datasets: The original datasets to be merged
        merged_dataset: The merged dataset
    Returns:
        bool: True if the merged dataset is a randomization of the original datasets, False otherwise
    '''
    # Part 1: Collect all array types, their features, and calculate total rows
    all_array_types = set()
    feature_sets = {}
    total_rows = merged_dataset[list(merged_dataset.keys())[0]][0].shape[0]
    for array_type, (array, features) in merged_dataset.items():
        all_array_types.add(array_type)
        if array_type not in feature_sets:
            feature_sets[array_type] = set()
        feature_sets[array_type].update(features)
    
    # Part 2: Create a Feature Mapping (Hash Table) for each array type
    feature_mappings = {
        array_type: {feature: idx for idx, feature in enumerate(features)}
        for array_type, features in feature_sets.items()
    }

    # Part 3: Verify the randomization
    for array_type in all_array_types:
        merged_array, merged_features = merged_dataset[array_type]
        for dataset in original_datasets:
            original_arr_types = set(dataset.keys())
            if array_type not in original_arr_types:
                continue
            original_array, original_features = dataset[array_type]
            original_features_mapped = [feature_mappings[array_type][feature] for feature in original_features]
            original_features_mapped = np.sort(original_features_mapped)
            merged_array_feature_indexed = merged_array[:, original_features_mapped]
            original_array_ordered = original_array[:, np.argsort(original_features_mapped)]
            for row in range(len(original_array_ordered)):
                original_row = original_array_ordered[row]
                if original_row not in merged_array_feature_indexed:
                    print(f'Row {original_row} from {array_type} not found in the merged dataset {merged_array_feature_indexed}')
                    return False
    return True

In [88]:
verify_randomization(datasets, merged_arrays)

True

In [122]:
# Make test sets to verify the function
dataset_a1 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)])
    }
dataset_a2 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)])
}
dataset_a3 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 10))), [f"feature{i}" for i in range(10)])
}

test_set1 = [dataset_a1, dataset_a2, dataset_a3]

# Larger test set using store and 100 rows and 100 columns
dataset_b1 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)])
    }
dataset_b2 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)])
}
dataset_b3 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 100))), [f"feature{i}" for i in range(100)])
}
test_set2 = [dataset_b1, dataset_b2, dataset_b3]

dataset_c1 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)])
    }
dataset_c2 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)])
}
dataset_c3 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 1000))), [f"feature{i}" for i in range(1000)])
}
test_set3 = [dataset_c1, dataset_c2, dataset_c3]

dataset_d1 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)])
    }
dataset_d2 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)])
}
dataset_d3 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 10000))), [f"feature{i}" for i in range(10000)])
}
test_set4 = [dataset_d1, dataset_d2, dataset_d3]

dataset_e1 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)])
    }
dataset_e2 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]
    )
}
dataset_e3 = {
    "type_A": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]),
    "type_B": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]),
    "type_C": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)]),
    "type_D": (zarr.array(np.random.randint(0, 10, (10, 100000))), [f"feature{i}" for i in range(100000)])
}
test_set5 = [dataset_e1, dataset_e2, dataset_e3]

In [240]:
%time merged_data_a1 = merge_zarr_datasets(test_set1)

Total Rows: 30
{'type_A': {'feature8': 0, 'feature1': 1, 'feature9': 2, 'feature2': 3, 'feature3': 4, 'feature6': 5, 'feature7': 6, 'feature5': 7, 'feature0': 8, 'feature4': 9}, 'type_B': {'feature2': 0, 'feature1': 1, 'feature8': 2, 'feature9': 3, 'feature0': 4, 'feature5': 5, 'feature3': 6, 'feature7': 7, 'feature4': 8, 'feature6': 9}, 'type_C': {'feature7': 0, 'feature6': 1, 'feature2': 2, 'feature4': 3, 'feature8': 4, 'feature9': 5, 'feature0': 6, 'feature3': 7, 'feature1': 8, 'feature5': 9}, 'type_D': {'feature1': 0, 'feature3': 1, 'feature4': 2, 'feature6': 3, 'feature8': 4, 'feature2': 5, 'feature0': 6, 'feature5': 7, 'feature9': 8, 'feature7': 9}}
Starting new dataset
Starting new dataset
Starting new dataset
CPU times: user 50 ms, sys: 18.2 ms, total: 68.2 ms
Wall time: 73.6 ms


In [150]:
%time merged_data_a1 = merge_zarr_datasets(test_set1)

Total Rows: 30
Starting new dataset
	Running for array type: type_C
	Running for array type: type_D
	Running for array type: type_A
	Running for array type: type_B
Starting new dataset/10
	Running for array type: type_C
	Running for array type: type_D
	Running for array type: type_A
	Running for array type: type_B
Starting new dataset
	Running for array type: type_C
	Running for array type: type_D
	Running for array type: type_A
	Running for array type: type_B
CPU times: user 419 ms, sys: 150 ms, total: 569 ms
Wall time: 568 ms


In [241]:
%time merged_data_b1 = merge_zarr_datasets(test_set2)

Total Rows: 30
{'type_A': {'feature20': 0, 'feature70': 1, 'feature4': 2, 'feature74': 3, 'feature15': 4, 'feature84': 5, 'feature53': 6, 'feature43': 7, 'feature48': 8, 'feature35': 9, 'feature56': 10, 'feature69': 11, 'feature8': 12, 'feature12': 13, 'feature19': 14, 'feature23': 15, 'feature11': 16, 'feature46': 17, 'feature32': 18, 'feature64': 19, 'feature36': 20, 'feature5': 21, 'feature92': 22, 'feature87': 23, 'feature30': 24, 'feature71': 25, 'feature72': 26, 'feature76': 27, 'feature57': 28, 'feature73': 29, 'feature85': 30, 'feature98': 31, 'feature47': 32, 'feature94': 33, 'feature52': 34, 'feature60': 35, 'feature37': 36, 'feature22': 37, 'feature50': 38, 'feature41': 39, 'feature67': 40, 'feature96': 41, 'feature1': 42, 'feature66': 43, 'feature93': 44, 'feature68': 45, 'feature13': 46, 'feature29': 47, 'feature91': 48, 'feature18': 49, 'feature88': 50, 'feature49': 51, 'feature63': 52, 'feature14': 53, 'feature3': 54, 'feature26': 55, 'feature2': 56, 'feature77': 57, 'fe

In [151]:
%time merged_data_b1 = merge_zarr_datasets(test_set2)

Total Rows: 30
Starting new dataset
	Running for array type: type_C
	Running for array type: type_D
	Running for array type: type_A
	Running for array type: type_B
Starting new dataset/10
	Running for array type: type_C
	Running for array type: type_D
	Running for array type: type_A
	Running for array type: type_B
Starting new dataset
	Running for array type: type_C
	Running for array type: type_D
	Running for array type: type_A
	Running for array type: type_B
CPU times: user 5.39 s, sys: 999 ms, total: 6.39 s
Wall time: 6.4 s


In [242]:
%time set3 = merge_zarr_datasets(test_set3)

Total Rows: 30
{'type_A': {'feature140': 0, 'feature878': 1, 'feature312': 2, 'feature880': 3, 'feature181': 4, 'feature457': 5, 'feature226': 6, 'feature60': 7, 'feature105': 8, 'feature239': 9, 'feature56': 10, 'feature823': 11, 'feature386': 12, 'feature240': 13, 'feature466': 14, 'feature426': 15, 'feature777': 16, 'feature592': 17, 'feature201': 18, 'feature328': 19, 'feature946': 20, 'feature972': 21, 'feature851': 22, 'feature991': 23, 'feature664': 24, 'feature929': 25, 'feature396': 26, 'feature113': 27, 'feature812': 28, 'feature249': 29, 'feature368': 30, 'feature379': 31, 'feature789': 32, 'feature639': 33, 'feature820': 34, 'feature917': 35, 'feature603': 36, 'feature485': 37, 'feature523': 38, 'feature161': 39, 'feature918': 40, 'feature333': 41, 'feature767': 42, 'feature86': 43, 'feature142': 44, 'feature643': 45, 'feature764': 46, 'feature447': 47, 'feature985': 48, 'feature175': 49, 'feature64': 50, 'feature809': 51, 'feature905': 52, 'feature981': 53, 'feature441': 5

In [252]:
np.random.permutation(10000)

array([6883, 5836, 8601, ..., 6265, 5734, 5191])

In [6]:
import sys
arr = zarr.zeros(shape=(10000, 5000), dtype=np.int64, chunks=(5000, 50000), overwrite=True, store='check.zarr')
arr[:] = np.random.randint(0, 100, (10000, 5000))
metadata_size = sys.getsizeof(arr)
print(f"Metadata size: {metadata_size} bytes")

'Metadata size: 48 bytes'


In [7]:
arr.info

0,1
Type,zarr.core.Array
Data type,int64
Shape,"(10000, 5000)"
Chunk shape,"(5000, 50000)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,400000000 (381.5M)
No. bytes stored,66218308 (63.2M)


In [8]:
sys.getsizeof(arr)

48

In [9]:
sys.getsizeof(arr[...])

400000128

In [10]:
sys.getsizeof(arr[0:50, :])

2000128