In [194]:
import numpy as np
import pandas as pd
import json
import netCDF4 as nc
import uuid
import os
from time import time_ns
from enum import Enum
import matplotlib.pyplot as plt

In [45]:
# Common functions and values
def make_test_meta():
    """Build a dummy metadata record describing one simulated experiment.

    Returns:
        dict: with keys, in this order,
            'uuid'   - string form of a freshly generated ``uuid.uuid1()``
            'param1' - the integer 12
            'param2' - the string 'a_string'
            'param3' - a uniform random draw from ``np.random.rand()`` scaled by 1e9
            'param4' - a list of three strings
    """
    record = dict(uuid=str(uuid.uuid1()))
    record['param1'] = 12
    record['param2'] = 'a_string'
    record['param3'] = np.random.rand() * 1e9
    record['param4'] = ['alist', 'of', 'strings']
    return record

def make_test_data(length_of_data):
    """Generate one dummy experiment's worth of column data.

    Args:
        length_of_data: number of samples in every column.

    Returns:
        dict: maps each column name to a 1-D numpy array of ``length_of_data``
        elements. 'time' holds the sample index multiplied by 1e-9; 'vals',
        'volts', 'dp' and 'dr' are filler columns of standard-normal noise.
    """
    columns = {'time': np.arange(length_of_data) * 1e-9}
    # Filler columns are drawn in this fixed order so a seeded RNG stays reproducible.
    for filler_name in ('vals', 'volts', 'dp', 'dr'):
        columns[filler_name] = np.random.randn(length_of_data)
    return columns

def get_col_names():
    """Return a fresh list of the column names used throughout the notebook, in a fixed order."""
    return 'time vals volts dp dr'.split()
def round_4(val):
    """Round ``val`` to four decimal places using the built-in ``round``."""
    return round(val, ndigits=4)


In [46]:
# NetCDF methods
def NC_make_meta_groupname(meta=None):
    """Encode a metadata dict as a string that can be used as a NetCDF group name.

    Args:
        meta: metadata dict to encode. When ``None`` (the default) a fresh
            record from ``make_test_meta()`` is used instead.

    Returns:
        str: the letter "x" followed by the JSON encoding of the metadata.
    """
    # Previously a caller-supplied `meta` was ignored and raised UnboundLocalError,
    # because only the None branch assigned the variable that was serialised.
    if meta is None:
        meta = make_test_meta()
    # The first character of the name needs to be a letter, hence the "x" prefix.
    return "x" + json.dumps(meta)

def NC_write_n_to_file(length_of_data, n_to_write, comp=False, fp=None):
    """Write ``n_to_write`` groups of dummy data to a NetCDF4 file and time it.

    Any existing file at the target path is deleted first. One dimension per
    column name is created on the root group; every experiment then gets its
    own group (named from freshly generated metadata) holding one "f8"
    variable per column.

    Args:
        length_of_data: number of samples in every column / dimension.
        n_to_write: number of groups (simulated experiments) to write.
        comp: when truthy, variables are created with ``compression='zlib'``.
        fp: output path; defaults to 'NetCDFTestFile1.nc' when ``None``.

    Returns:
        tuple: ``(elapsed_seconds, file_size)`` where ``file_size`` is the size
        on disk in bytes divided by 1000.
    """
    target = 'NetCDFTestFile1.nc' if fp is None else fp

    # Always start from a clean file so the reported size reflects this run only.
    if os.path.exists(target):
        os.unlink(target)

    started = time_ns()

    with nc.Dataset(target, "w", format="NETCDF4") as dataset:
        column_names = get_col_names()
        variable_options = {'compression': 'zlib'} if comp else {}

        # Dimensions live on the root group and are shared by every experiment group.
        for column in column_names:
            dataset.createDimension(column, length_of_data)

        for _ in range(n_to_write):
            # One group per experiment; its name carries the metadata.
            experiment = dataset.createGroup(NC_make_meta_groupname())
            samples = make_test_data(length_of_data)

            for column in column_names:
                variable = experiment.createVariable(
                    column, "f8", (dataset.dimensions[column],), **variable_options
                )
                variable[:] = samples[column]  # fill in the data

    # Leaving the with-block must have closed the file.
    assert not dataset.isopen()
    finished = time_ns()
    size_kb = os.path.getsize(target) / 1000  # to get in kb
    return (finished - started) * 1e-9, size_kb


def NC_load(source):
    """Time reading the group names and three data columns from a NetCDF file.

    For every group in the file the 'time', 'volts' and 'vals' variables are
    read in full. The number of groups is printed along the way.

    Args:
        source: path of the NetCDF file to open.

    Returns:
        tuple: ``(metadata_read, columns_read)`` in seconds. Both are measured
        from the same start point, so ``columns_read`` includes the time
        already counted in ``metadata_read``.
    """
    started = time_ns()

    with nc.Dataset(source) as dataset:
        # The metadata is stored in the group names themselves.
        group_names = dataset.groups.keys()
        metadata_read = (time_ns() - started) * 1e-9

        groups = dataset.groups.values()
        print("no of groups/ns", len(groups))
        for group in groups:
            # Slice with [:] so the column data is actually read.
            for column in ('time', 'volts', 'vals'):
                _ = group.variables[column][:]

    # Leaving the with-block must have closed the file.
    assert not dataset.isopen()
    columns_read = (time_ns() - started) * 1e-9

    return metadata_read, columns_read
#     except:
#         rootgrp.close()
#     finally:
#          assert(rootgrp.isopen() == False)


In [50]:
# Smoke test: write a single compressed group of 2000 samples, then read it back
fp = 'NetCDFTestFile2.nc'
writetime, filesize = NC_write_n_to_file(2000, 1, comp=True, fp=fp)
metadata_read, columns_read = NC_load(fp)

print(
    f'time to write: {round_4(writetime)} s, '
    f'time to metaread {round_4(metadata_read)} s and colread is {round_4(columns_read)} s, '
    f'size is {round_4(filesize)} KB'
)

no of groups/ns 1
time to write: 0.0053 s, time to metaread 0.0157 s and colread is 0.0157 s, size is 86.67 KB


In [278]:
# 2000 time to write: 0.0 s, time to metaread 0.0080321 s and colread is 0.0080321 s, size is 89.001 KB,
# 2000, time to write: 0.0249943 s, time to metaread 0.0 s and colread is 0.0 s, size is 86.697 KB 
# 1,000,000 time to write: 0.1846009 s, time to metaread 0.017372500000000003 s and colread is 0.0333759 s, size is 40009.0 KB
# 1000000, n = 1 time to write: 0.1747 s, time to metaread 0.0156 s and colread is 0.0313 s, size is 40009.001 KB
# 1000000, n = 1 time to write: 1.187 s, time to metaread 0.008 s and colread is 0.08 s, size is 28553.019 KB
# 1000000, n = 10, time to write: 1.6988159 s, time to metaread 0.0080033 s and colread is 0.13074 s, size is 400032.831 KB
# 1000000, n = 10, time to write: 11.009797 s, time to metaread 0.0081026 s and colread is 0.6249661000000001 s, size is 285467.403 KB

In [204]:
# Scratch work
def create_dimensions(grp, data_length):
    """Create one dimension of size ``data_length`` on ``grp`` for every column name."""
    for dim_name in get_col_names():
        grp.createDimension(dim_name, data_length)
        
def create_sub_groups(parent_grp):
    """Add one child group to ``parent_grp``, named after freshly generated test metadata.

    Returns:
        The group object returned by ``parent_grp.createGroup``.
    """
    # The name is the JSON-encoded metadata with an "x" prefix,
    # because the first character needs to be a letter.
    group_name = "x" + json.dumps(make_test_meta())
    return parent_grp.createGroup(group_name)

def create_vars(grp):
    """Create one "f8" variable on ``grp`` for every column name.

    Each variable is sized by the dimension that has the same name as the
    column. The dimension is referenced by name instead of being fetched from
    the notebook-level ``root`` handle, so the function depends only on its
    argument and works for a group of any dataset.

    Args:
        grp: group to create the variables in. A dimension named after each
            column must already exist on it or on one of its parent groups
            (netCDF4 resolves dimension names through the parents).
    """
    for col_name in get_col_names():
        grp.createVariable(col_name, "f8", (col_name,))
        
def update_vars(grp, data_length):
    """Overwrite every variable in ``grp`` with freshly generated test data.

    Args:
        grp: group whose variables are filled; each variable's name is used to
            look up the matching column in the generated data.
        data_length: number of samples to generate per column.
    """
    fresh_columns = make_test_data(data_length)
    for variable in grp.variables.values():
        column = fresh_columns[variable.name]
        variable[:] = column
        
# Open the data set in write mode; `root` is the handle the cells below keep using
root = nc.Dataset("testNetCDF.nc", "w", format="NETCDF4")
print(root.data_model)
data_length = 2000
# Dimensions go on the root group; the variables of the sub group created below are sized by them
create_dimensions(root, data_length)
testgrp = create_sub_groups(root)
create_vars(testgrp)
update_vars(testgrp,data_length)
# The dataset is deliberately left open here: a later cell still reads from
# `testgrp`, and `root.close()` is called in its own cell afterwards.
print("Is open ?", root.isopen())


NETCDF4
Is open ? True


In [205]:
# Printing functions

'''
Dimensions can be accessed via <grp_name>.dimensions, a dict. Once dimensions are created they cannot be deleted.
Dimensions at the root/parent group can be used in any subgroups. You can also have dimensions internal to a group
You can iterate using <grp_name>.dimensions.items(), key is a str of dim name and value is <class 'netCDF4._netCDF4.Dimension'>
It has attributes like val.name, val.size and you can make your own attributes
'''

def print_dim_names(grp):
    """Print the name and size of every dimension defined on ``grp``."""
    for dim_name in grp.dimensions:
        dimension = grp.dimensions[dim_name]
        print("Dimension name:", dim_name)  # or dimension.name
        print("Length", dimension.size)
   
'''
Groups can be accessed by <grp_name>.groups, this is a dictionary. Groups cannot be deleted
To iterate use <grp_name>.groups.items(), the key = <group_name> (str) and value an object of type <class 'netCDF4._netCDF4.Group'>
The value has attributes like val.name, val.dimensions, val.variables, val.groups that can be accessed
You can set your own attributes, val.<attr_name> = <attr_value> and then retrieve them
You can also access groups with root.groups[<group_name>]
'''
def print_sub_groups(parent_grp):
    """Print the name, dimension names, variable names and subgroup names of each child group of ``parent_grp``."""
    for child in parent_grp.groups.values():
        print("Group name:", child.name)
        details = (
            ("Group dimen are:", child.dimensions),
            ("Group variables: ", child.variables),
            ("Group subgroups: ", child.groups),
        )
        for label, mapping in details:
            print(label, mapping.keys())

def print_root_group_info(rootgrp):
    """Print the name of ``rootgrp`` followed by the names of its dimensions, variables and sub groups."""
    report = (
        ("Root name:", rootgrp.name),
        ("Root dimen:", rootgrp.dimensions.keys()),
        ("Root variables", rootgrp.variables.keys()),
        ("Sub groups: ", rootgrp.groups.keys()),
    )
    for label, value in report:
        print(label, value)
    
'''
Variables can be accessed by <grp_name>.variables, this is a dictionary. Once variables are created they can never be deleted
To iterate use <grp_name>.variables.items(), the key = <var_name> and value an object of type <class 'netCDF4._netCDF4.Variable'>
The value has attributes like val.name, val.size, val.shape, val.datatype that can be accessed
You can set your own attributes, val.<attr_name> = <attr_value> and then access it in a similar way print(val.<attr_name>)
You can see all the attributes you created using val.ncattrs() -> a list of all attributes
Attributes made can be deleted using del val.<attr_name>
The variables values are stored as masked arrays
'''
def print_var_names(grp):
    """Print, for every variable in ``grp``, its key, its name/size/shape/datatype and its attribute list."""
    for var_key in grp.variables:
        variable = grp.variables[var_key]
        print("key", var_key)
        summary = (variable.name, variable.size, variable.shape, variable.datatype)
        print(" val is", *summary)
        print(variable.ncattrs())

def print_var_vals(grp):
    """Print each variable's name followed by all of its values, read via ``grp[name][:]``."""
    for variable in grp.variables.values():
        label = variable.name
        print(label)
        print(grp[label][:])

# Inspection helpers for the still-open dataset: only the variable values are
# printed by default; uncomment any of the other calls to run them.
# print_dim_names(root)
# print_sub_groups(root)
# print_root_group_info(root)
print_var_vals(testgrp)
# print_var_names(testgrp)
    

time
[0.000e+00 1.000e-09 2.000e-09 ... 1.997e-06 1.998e-06 1.999e-06]
vals
[0.17217752 1.09010898 1.01769319 ... 0.97805056 0.05291023 0.43305815]
volts
[-0.44248807  1.69635159 -0.99926819 ...  1.05278929 -0.31794579
  0.12904334]
dp
[ 0.5557416  -0.06853769 -0.52185739 ...  0.1011512   1.60889894
 -0.52819376]
dr
[-0.509609   -0.69344654 -0.30615196 ... -0.80484416  0.02577572
 -0.25333635]


In [206]:
# Close the dataset handle that the scratch-work cell left open.
root.close()