In [1]:
import numpy as np
from tqdm import tqdm
import time
import concurrent.futures

def load_chunk(npz_file, chunk_keys):
    """Look up a group of keys in ``npz_file`` and return them as a dict.

    Args:
        npz_file: Any object that supports ``npz_file[key]`` lookup; in this
            notebook the caller passes the object returned by ``np.load``.
        chunk_keys: Iterable of keys to read.

    Returns:
        dict mapping every key in ``chunk_keys`` to ``npz_file[key]``, in the
        order the keys were given.
    """
    return {name: npz_file[name] for name in chunk_keys}

def load_npz_with_progress(filename, chunk_size=50):
    """Load every entry of an ``.npz`` archive into a dict, with a progress bar.

    The archive's keys are split into consecutive chunks of ``chunk_size``
    keys. Each chunk is read by ``load_chunk`` on a thread pool, and the
    progress bar advances once per merged chunk.

    Args:
        filename: Path of the ``.npz`` file, passed straight to ``np.load``.
        chunk_size: Number of keys per chunk; must be positive.

    Returns:
        dict mapping each key of the archive to its loaded value, in the
        archive's own key order (``npz_file.files``).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        Exception: Any exception raised by a worker while reading a chunk is
            re-raised here unchanged.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # SECURITY: allow_pickle=True lets np.load unpickle object arrays, and
    # unpickling can execute arbitrary code. Only load files you trust.
    # The context manager guarantees the archive's file handle is closed,
    # including when a chunk fails to load.
    with np.load(filename, allow_pickle=True) as npz_file:
        keys = npz_file.files

        # Stepping the range by chunk_size yields the ceil-divided chunk list
        # directly; an archive with no keys produces no chunks.
        chunks = [keys[start:start + chunk_size]
                  for start in range(0, len(keys), chunk_size)]

        embed_dict = {}

        with tqdm(total=len(chunks), desc='Loading embeddings') as pbar:
            # NOTE(review): every worker reads through the same NpzFile handle.
            # Whether concurrent reads on it are safe depends on the zipfile
            # implementation of the running Python - confirm, or pass
            # max_workers=1, if reads fail intermittently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                futures = [executor.submit(load_chunk, npz_file, chunk)
                           for chunk in chunks]
                try:
                    # Merge in submission order (not completion order) so the
                    # key order of the result is deterministic; the workers
                    # still run in parallel while we wait on each future.
                    for future in futures:
                        embed_dict.update(future.result())
                        pbar.update(1)
                except BaseException:
                    # Leaving the executor block waits for all queued work;
                    # cancel what has not started so a failure (or Ctrl-C)
                    # surfaces promptly instead of after the remaining chunks.
                    for pending in futures:
                        pending.cancel()
                    raise

    return embed_dict




In [2]:
# Define the path to your NPZ file
# NOTE(review): the path points at the *Test* embeddings, yet the result below
# is stored in `Train_embed_dict` - confirm which of the two is intended.
filename = './data/Dataset/embeddings/Test_TM_Vec.npz'

# Load the NPZ file with progress
# The result is a dict keyed by the archive's entry names; later cells read it
# through the name `Train_embed_dict`.
Train_embed_dict = load_npz_with_progress(filename)


Loading embeddings:  77%|███████▋  | 15912/20714 [4:34:19<1:22:47,  1.03s/it]


AssertionError: 

In [None]:
# Report how many embeddings were loaded
print(f"Size of the dictionary: {len(Train_embed_dict)}")

# Write the embedding keys to disk, sorted, one per line (no trailing newline)
# NOTE(review): the keys are written in sorted order, but the rows of X_train
# below follow the dict's insertion order - line i of this file does not
# necessarily name row i of X_train. Confirm whether they are meant to align.
with open('embedding_keys.txt', 'w') as f:
    f.write('\n'.join(sorted(Train_embed_dict)))

# Collect all embeddings, in dict order, into a single array
# (presumably every embedding has the same shape - TODO confirm)
X_train = np.array(list(Train_embed_dict.values()))