## First: Load the file from the Hugging Face website

For this we need to define a dictionary that uses the file name (without extension) as the key and the file name with its extension as the value, e.g. `{"scoring_input": "scoring_input.parquet"}`. Then we can proceed with loading the file from Hugging Face.

In [1]:
def load_dataset_from_huggingface(data_files, path = "LEAP/subsampled_low_res"):
    """Load a dataset from the Hugging Face Hub and return it as a NumPy array.

    Parameters
    ----------
    data_files : dict
        Maps a split name to the file that backs it, e.g.
        ``{"scoring_input": "scoring_input.parquet"}``. The whole mapping is
        handed to ``datasets.load_dataset``; when it has several entries only
        the split named by the LAST key is converted and returned.
    path : str, optional
        Repository passed to ``datasets.load_dataset``.

    Returns
    -------
    numpy.ndarray
        ``float32`` array built from every feature of the selected split, in
        feature order, and transposed so that (for scalar-valued features)
        rows are samples and columns are features.

    Raises
    ------
    ValueError
        If ``data_files`` is empty, because there is then no split to select.
    """
    # Validate before the heavy imports / download so a bad call fails
    # immediately with a clear message instead of an unbound-variable error.
    if not data_files:
        raise ValueError(
            "data_files must map at least one split name to a file, "
            'e.g. {"scoring_input": "scoring_input.parquet"}'
        )

    from datasets import load_dataset
    from tqdm import tqdm
    import numpy as np

    dataset = load_dataset(path, data_files=data_files)

    # Same selection as before (the last key wins), just stated explicitly.
    split_name = list(data_files.keys())[-1]
    split = dataset[split_name]

    columns = []
    for key in tqdm(split.features.keys()):
        # Convert each column to float32 as soon as it is read so that the
        # columns are not all kept around as Python lists at the same time.
        columns.append(np.asarray(split[key], dtype=np.float32))

    # Stack as (features, samples), then flip to (samples, features).
    hf_array = np.array(columns, dtype=np.float32)
    hf_array = np.transpose(hf_array)

    return hf_array
    

## Second: Load file from the local system

For this we just need the file_name and the root_path where it is stored on your local system.

In [5]:
from climsim_utils.data_utils import *

def load_original_dataset(file_name, root_path):
    """Load a locally stored ``.npy`` file through ``data_utils``.

    All paths are resolved below ``<root_path>/GitHub/ClimSim``. They are
    built with ``os.path.join``, so ``root_path`` works with or without a
    trailing slash.

    Parameters
    ----------
    file_name : str
        Name of the file inside ``<root_path>/GitHub/ClimSim/npy_files``.
    root_path : str
        Directory that contains the ``GitHub/ClimSim`` checkout.

    Returns
    -------
    The value returned by ``data_utils.load_npy_file`` for the requested file.
    """
    climsim_root = os.path.join(root_path, 'GitHub', 'ClimSim')

    grid_path = os.path.join(climsim_root, 'grid_info', 'ClimSim_low-res_grid-info.nc')
    norm_path = os.path.join(climsim_root, 'preprocessing', 'normalizations')

    grid_info = xr.open_dataset(grid_path)
    input_mean = xr.open_dataset(os.path.join(norm_path, 'inputs', 'input_mean.nc'))
    input_max = xr.open_dataset(os.path.join(norm_path, 'inputs', 'input_max.nc'))
    input_min = xr.open_dataset(os.path.join(norm_path, 'inputs', 'input_min.nc'))
    output_scale = xr.open_dataset(os.path.join(norm_path, 'outputs', 'output_scale.nc'))

    data = data_utils(grid_info = grid_info,
                      input_mean = input_mean,
                      input_max = input_max,
                      input_min = input_min,
                      output_scale = output_scale)

    data.set_to_v1_vars()

    # Files are stored in this folder on the local system.
    data_path = os.path.join(climsim_root, 'npy_files')

    final_array = data.load_npy_file(os.path.join(data_path, file_name))

    return final_array

## Third: Compare to confirm that the two files are the same

Let's test using the file: scoring_input

In [3]:
# Split name -> parquet file that backs it in the Hugging Face repository.
data_files = dict(scoring_input="scoring_input.parquet")
hf_array = load_dataset_from_huggingface(data_files=data_files)
hf_array.shape

Found cached dataset parquet (/Users/shreyaverma/.cache/huggingface/datasets/LEAP___parquet/LEAP--subsampled_low_res-8e54c9bc8aab12e5/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

100%|█████████████████████████████████████████| 124/124 [05:23<00:00,  2.61s/it]


(1681920, 124)

In [6]:
import os

# Resolve the root from the current user's home directory instead of a
# machine-specific absolute path. The trailing slash of the original value
# is kept on purpose.
root_path = os.path.expanduser("~/Documents/")  # Giving the root path
file_name = 'scoring_input.npy'  # Giving the file name
final_array = load_original_dataset(file_name, root_path)
final_array.shape

(1681920, 124)

In [7]:
import numpy as np

# Equal shapes alone do not show that the data is the same, so the values are
# compared as well (within floating-point tolerance, NaNs treated as equal).
# True confirms we're successful in loading data from huggingface.
hf_array.shape == final_array.shape and bool(np.allclose(hf_array, final_array, equal_nan=True))

True