In [1]:
import pandas
import pandas as pd
import numpy
import os, glob, sys
import calendar

import multiprocessing
from functools import partial
import h5py

In [32]:
def datetime_to_seconds(dates, ref='1900-01-01T00:00:00'):
    """Express datetime64 value(s) as integer seconds elapsed since ``ref``.

    Parameters
    ----------
    dates : numpy.datetime64 or array of datetime64
        Timestamp(s) to convert.
    ref : str
        Reference epoch, parsed with ``numpy.datetime64``.

    Returns
    -------
    numpy.int64 or array of int64
        Seconds between ``ref`` and ``dates`` (the float quotient is cast
        to int64).
    """
    origin = numpy.datetime64(ref)
    one_second = numpy.timedelta64(1, 's')
    elapsed = dates - origin
    seconds = elapsed / one_second
    return seconds.astype(numpy.int64)

class TimeBatch:
    """Plain holder for a (year, month) pair."""

    def __init__(self, year, month):
        # Values are stored exactly as passed in; nothing is validated or converted.
        self.year, self.month = year, month

def _read_table_columns(group, rows=None):
    """Load the members of an HDF5 group as a ``{name: numpy array}`` dict.

    Parameters
    ----------
    group : h5py.Group
        Table group whose members are read as columns.
    rows : slice, boolean array or None
        Row selection. A ``slice`` is handed to h5py directly so only that
        range is read; a boolean mask is applied after loading the whole
        column; ``None`` loads every row.

    Returns
    -------
    dict
        Column name -> values, ready to be passed to ``pandas.DataFrame``.
    """
    columns = {}
    for name in group:
        # dropping string dims - not necessary for dataframes
        if "string" in str(name):
            continue
        dataset = group[name]
        if rows is None:
            values = dataset[:]
        elif isinstance(rows, slice):
            values = dataset[rows]
        else:
            values = dataset[:][rows]
        if len(dataset.shape) > 1:
            # 2-D members are treated as byte/character arrays: the entries
            # along axis 1 are concatenated into one string per row.
            values = values.astype(object).sum(axis=1).astype(str)
        else:
            values = numpy.array(values)
        columns[name] = values
    return columns


def read_nc_file(
        file: str, table_name: str, time_batch: TimeBatch = None
) -> pandas.DataFrame:
    """Read one table of an nc (HDF5) file into a DataFrame using h5py.

    Parameters
    ----------
    file : str
        Path or glob pattern; it must match exactly one existing file.
    table_name : str
        Name of the group to read. Tables listed in ``sorted_by_variable``
        are sliced per variable via the ``recordindices`` group, tables in
        ``sorted_by_date`` are masked with ``header_table/report_timestamp``,
        and any other table is read completely (``time_batch`` is ignored).
    time_batch : TimeBatch, optional
        Calendar month to select. Without it the window spans
        1900-01-01 to 2999-01-01.

    Returns
    -------
    pandas.DataFrame
        The selected rows; for per-variable tables the variables are stacked
        below each other with a fresh integer index.

    Raises
    ------
    FileNotFoundError
        If ``file`` does not match exactly one file.
    """
    sorted_by_variable = ['advanced_homogenisation', 'advanced_uncertainty', 'era5fb', 'observations_table', ]
    sorted_by_date = ['header_table', 'source_configuration']

    # check if file is available
    nc_file = glob.glob(file)
    if len(nc_file) != 1:
        raise FileNotFoundError(
            f"expected exactly one file matching {file!r}, found {len(nc_file)}"
        )

    # create time window (in seconds since 1900-01-01) for the requested month
    if time_batch is not None:
        year = time_batch.year
        month = time_batch.month
        last_month_day = calendar.monthrange(year, month)[1]
        start = f"{year}-{month:02d}-01T00:00:00"
        end = f"{year}-{month:02d}-{last_month_day}T23:59:59"
    # select all if no time_batch is given
    else:
        start = "1900-01-01T00:00:00"
        end = "2999-01-01T00:00:00"
    selected_start = datetime_to_seconds(numpy.datetime64(start))
    selected_end = datetime_to_seconds(numpy.datetime64(end))

    var_dfs = {}
    # Open the file that glob actually resolved, read-only so it can never be
    # modified or created by accident.
    with h5py.File(nc_file[0], 'r') as hfile:
        table = hfile[table_name]

        if table_name in sorted_by_variable:
            record_index = hfile['recordindices']
            time_index = record_index['recordtimestamp'][:]
            # Positions of the records inside the window; identical for every
            # variable, so computed once outside the loop.
            selected = numpy.flatnonzero(
                (time_index >= selected_start) & (time_index <= selected_end)
            )
            variables = [
                name for name in record_index
                if name not in ('index', 'recordtimestamp')
            ]
            for variable in variables:
                if selected.size:
                    # NOTE(review): recordindices/<variable> is expected to hold
                    # one more entry than recordtimestamp (row offset of every
                    # record plus a trailing end offset) - confirm against the
                    # file format. The rows of the last selected record then end
                    # at the offset of the following entry, so that record is
                    # kept even when the window stops before the end of file.
                    offsets = record_index[variable][:]
                    rows = slice(int(offsets[selected[0]]), int(offsets[selected[-1] + 1]))
                else:
                    # nothing inside the window: empty frame, same columns
                    rows = slice(0, 0)
                var_dfs[variable] = pd.DataFrame(_read_table_columns(table, rows))

        elif table_name in sorted_by_date:
            # NOTE(review): every table in sorted_by_date is masked with the
            # header_table timestamps, i.e. it is assumed to be row-aligned
            # with header_table - confirm for source_configuration.
            time_index = hfile['header_table']['report_timestamp'][:]
            selector = (time_index >= selected_start) & (time_index <= selected_end)
            var_dfs[table_name] = pd.DataFrame(_read_table_columns(table, selector))

        else:
            # tables without a time dimension are read completely
            var_dfs[table_name] = pd.DataFrame(_read_table_columns(table))

    # stack all variables below each other
    frames = list(var_dfs.values())
    if not frames:
        return pandas.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [37]:
# Time window: January 2000.
tb = TimeBatch(year=2000, month=1)
# Take the first file in the converted_v8 directory whose name contains 11035.
file = glob.glob('/mnt/users/scratch/leo/scratch/converted_v8/*11035*')[0]
# Alternatives tried here: 'header_table', 'observations_table'
table_name = 'observed_variable'

df = read_nc_file(file=file, table_name=table_name, time_batch=tb)
df

Unnamed: 0,description,domain,name,observed_variable_len,parameter_group,sub_domain,units,variable
0,Vertical column integral of spectral aerosol a...,,aerosol absorption optical depth,0.0,aerosols,,Dimensionless,0
1,2D field of the column burden of condensed par...,,aerosol column burden,0.0,aerosols,,g m-2,1
2,3-D field of concentration of dust or sand in ...,,aerosol dust concentration,0.0,aerosols,,g kg-1,2
3,"3D field of mean aerosol particle size, define...",,aerosol effective radius,0.0,aerosols,,micro m,3
4,3D field of spectral volumetric extinction cro...,,aerosol extinction coefficient,0.0,aerosols,,m-1,4
...,...,...,...,...,...,...,...,...
137,Level partial pressure of ozone in milli-Pasca...,atmospheric,ozone partial pressure,0.0,,upper-air,Pa,150
138,Level mixing ratio of ozone in ppmv,atmospheric,ozone concentration,0.0,,upper-air,ppmv,151
139,Ozone (DU) integrated up to the current altitu...,atmospheric,total ozone column,0.0,,upper-air,DU,152
140,Ozone amount integrated over the whole balloon...,atmospheric,flight summary integrated O3,0.0,,upper-air,DU,153


In [23]:
# Make the cds-backend code directory (resolved relative to the current working
# directory) importable, then load the project-local cds_eua4 module from it.
sys.path.append(os.getcwd()+'/../cds-backend/code/')
import cds_eua4 as eua

# Open `file` (the path selected in an earlier cell) with the project's
# CDMDataset wrapper and display several of its parts for inspection.
with eua.CDMDataset(file) as ofile:
    display(ofile)
    display(ofile.z_coordinate_type)
# Optional recordindices inspection, currently disabled:
#     display(ofile.recordindices)
#     display(ofile.recordindices.recordtimestamp[-20:])
#     display(ofile.recordindices['39'][-20:])
    display(ofile.header_table)
    # last 20 report timestamps of the header table
    display(ofile.header_table.report_timestamp[-20:])
    # NOTE(review): presumably returns the observed_variable column of
    # observations_table as data - confirm against cds_eua4.
    display(ofile.load_variable_from_file(name='observed_variable', group='observations_table', return_data=True))

File: <HDF5 file "0-20001-0-11035_CEUAS_merged_v1.nc" (mode r)>
Filesize: 2465.82 MB
Filename: /mnt/users/scratch/leo/scratch/converted_v8/0-20001-0-11035_CEUAS_merged_v1.nc
(G)roups/(V)ariables: 

 - G | advanced_homogenisation______________________ : : 7
 - G | advanced_uncertainty_________________________ : : 9
 - G | crs__________________________________________ : : 4
 - V | dateindex____________________________________ : : (28103,)
 - G | era5fb_______________________________________ : : 72
 - G | header_table_________________________________ : : 56
 - G | observations_table___________________________ : : 50
 - G | observed_variable____________________________ : : 9
 - G | recordindices________________________________ : : 13
 - G | sensor_configuration_________________________ : : 12
 - G | source_configuration_________________________ : : 2
 - G | station_configuration________________________ : : 46
 - G | station_configuration_codes__________________ : : 7
 - G | station_type___

z_coordinate_type:

description_______________________________________ : : (2, 80)
string80__________________________________________ : : (80,)
type______________________________________________ : : (2,)
z_coordinate_type_len_____________________________ : : (2,)

header_table:

application_area__________________________________ : : (97514,)
crs_______________________________________________ : : (97514,)
duplicate_status__________________________________ : : (97514,)
duplicates________________________________________ : : (97514, 70)
events_at_station_________________________________ : : (97514,)
height_of_station_above_local_ground______________ : : (97514,)
height_of_station_above_sea_level_________________ : : (97514,)
height_of_station_above_sea_level_accuracy________ : : (97514,)
index_____________________________________________ : : (97514,)
instrument________________________________________ : : (97514, 20)
latitude__________________________________________ : : (97514,)
location_accuracy_________________________________ : : (97514,)
location_method___________________________________ : : (97514,)
location_quality__________________________________ : : (97514,)
longitude_________________________________________ : : (97514,)
number_of_pressure_

array([3849118240, 3849161784, 3849204640, 3849248079, 3849291056,
       3849334398, 3849377518, 3849420628, 3849463845, 3849507003,
       3849550217, 3849593454, 3849636666, 3849680440, 3849723123,
       3849766228, 3849809658, 3849852628, 3849895825, 3849939201])

[array([  0,   0,   0, ..., 140, 140, 140])]