In [1]:
import numpy as np
import xarray as xr

In [2]:
# Open the training archive. np.load on an .npz file returns an NpzFile
# handle; the individual arrays are fetched later by indexing it with a key.
data = np.load('./Training.npz')
data

<numpy.lib.npyio.NpzFile at 0x7fd6ef57fc88>

# Metadata

In [3]:
# Array of input-variable names stored under key 'arr_0'; display how many
# names it holds.
input_names = data['arr_0']
input_names.size

88

In [4]:
# The names are stored as byte strings; convert them to unicode strings so
# they can be used directly as coordinate labels.
input_names = input_names.astype(np.str_)
input_names

array(['I', 'J', 'L', 'TEMP', 'PRESS', 'NUMDEN', 'H2O', 'JV1', 'JV2',
       'JV3', 'JV4', 'JV5', 'JV6', 'JV7', 'JV8', 'JV9', 'JV10', 'JV11',
       'JV12', 'NO', 'O3', 'PAN', 'CO', 'ALK4', 'ISOP', 'HNO3', 'H2O2',
       'ACET', 'MEK', 'ALD2', 'RCHO', 'MVK', 'MACR', 'PMN', 'PPN', 'R4N2',
       'PRPE', 'C3H8', 'CH2O', 'C2H6', 'N2O5', 'HNO4', 'MP', 'DMS', 'SO2',
       'SO4', 'SO4s', 'MSA', 'NH3', 'NH4', 'NIT', 'NITs', 'BCPI', 'OCPI',
       'BCPO', 'OCPO', 'DST1', 'DST2', 'DST3', 'DST4', 'SALA', 'SALC',
       'Br2', 'Br', 'BrO', 'HOBr', 'HBr', 'BrNO2', 'BrNO3', 'CHBr3',
       'CH2Br2', 'CH3Br', 'MPN', 'ISOPND', 'ISOPNB', 'MOBA', 'PROPNN',
       'HAC', 'GLYC', 'MVKN', 'MACRN', 'RIP', 'IEPOX', 'MAP', 'NO2',
       'NO3', 'HNO2', 'CH4'], dtype='<U6')

In [5]:
# Array of predicted-variable names stored under key 'arr_1'; display how
# many names it holds.
pred_names = data['arr_1']
pred_names.size

55

In [6]:
# Same conversion as for the input names: byte strings -> unicode strings.
pred_names = pred_names.astype(np.str_)
pred_names

array(['NO', 'O3', 'PAN', 'CO', 'ALK4', 'ISOP', 'HNO3', 'H2O2', 'ACET',
       'MEK', 'ALD2', 'RCHO', 'MVK', 'MACR', 'PMN', 'PPN', 'R4N2', 'PRPE',
       'C3H8', 'CH2O', 'C2H6', 'N2O5', 'HNO4', 'MP', 'DMS', 'SO2', 'SO4',
       'MSA', 'Br2', 'Br', 'BrO', 'HOBr', 'HBr', 'BrNO2', 'BrNO3',
       'CHBr3', 'CH2Br2', 'CH3Br', 'MPN', 'ISOPND', 'ISOPNB', 'MOBA',
       'PROPNN', 'HAC', 'GLYC', 'MVKN', 'MACRN', 'RIP', 'IEPOX', 'MAP',
       'NO2', 'NO3', 'HNO2', 'HO2', 'OH'], dtype='<U6')

# Numerical data

## Input

In [7]:
%%time
# Slow step: indexing the archive pulls the whole 'arr_2' array into memory
# (roughly 2 min wall time and ~40 GB in the recorded run).
mega_input = data['arr_2']

CPU times: user 46.9 s, sys: 51.2 s, total: 1min 38s
Wall time: 2min 17s


In [8]:
# Shape of the input matrix and its in-memory size in GB (1e9 bytes).
mega_input.shape, mega_input.nbytes / 1e9

((57852486, 88), 40.728150144)

In [10]:
# Wrap the raw input matrix in a labelled DataArray: the first axis is the
# (unlabelled) 'sample' dimension, the second is 'var', whose coordinate
# carries the input variable names.
dr_input = xr.DataArray(
    mega_input,
    coords={'var': input_names},
    dims=['sample', 'var'],
    name='data',
)
dr_input

<xarray.DataArray 'data' (sample: 57852486, var: 88)>
array([[1.0000e+00, 2.0000e+00, 2.0000e+00, ..., 5.7257e+02, 2.8759e+05,
        3.7944e+13],
       [1.0000e+00, 2.0000e+00, 6.0000e+00, ..., 1.7764e+03, 6.1624e+05,
        3.5697e+13],
       [1.0000e+00, 6.0000e+00, 8.0000e+00, ..., 1.4000e+04, 1.5113e+06,
        4.5101e+13],
       ...,
       [7.2000e+01, 4.5000e+01, 1.0000e+01, ..., 9.4561e+03, 3.2533e+05,
        4.2692e+13],
       [7.2000e+01, 4.5000e+01, 1.1000e+01, ..., 9.5895e+03, 3.5187e+05,
        4.1983e+13],
       [7.2000e+01, 4.6000e+01, 1.0000e+01, ..., 9.4617e+03, 3.2552e+05,
        4.2718e+13]])
Coordinates:
  * var      (var) <U6 'I' 'J' 'L' 'TEMP' 'PRESS' 'NUMDEN' 'H2O' 'JV1' 'JV2' ...
Dimensions without coordinates: sample

In [11]:
%%time
# Slow step: writes the full input DataArray to a NetCDF file
# (about 2.5 min wall time in the recorded run).
dr_input.to_netcdf('./train_X.nc')

CPU times: user 8.1 s, sys: 20 s, total: 28.1 s
Wall time: 2min 35s


In [12]:
# Sanity check: print only the header (dimensions and variables) of the
# file that was just written.
!ncdump -h train_X.nc

netcdf train_X {
dimensions:
	var = 88 ;
	sample = 57852486 ;
variables:
	string var(var) ;
	double data(sample, var) ;
		data :_FillValue = NaN ;
}


## Output

In [7]:
%%time
# Slow step: indexing the archive pulls the whole 'arr_3' array into memory
# (about 40 s wall time in the recorded run).
mega_pred = data['arr_3']

CPU times: user 27.7 s, sys: 10.4 s, total: 38.1 s
Wall time: 38.1 s


In [10]:
# Wrap the raw prediction matrix in a labelled DataArray: the first axis is
# the (unlabelled) 'sample' dimension, the second is 'var', whose coordinate
# carries the predicted variable names.
dr_pred = xr.DataArray(
    mega_pred,
    coords={'var': pred_names},
    dims=['sample', 'var'],
    name='data',
)
dr_pred

<xarray.DataArray 'data' (sample: 57852486, var: 55)>
array([[1.0000e+00, 5.9604e+11, 1.7449e+08, ..., 2.8782e+05, 2.8505e+04,
        3.7376e+01],
       [1.0000e+00, 5.8733e+11, 1.8214e+08, ..., 6.1685e+05, 2.7071e+04,
        3.9640e+01],
       [1.3892e+06, 6.9971e+11, 2.8033e+08, ..., 1.1089e+06, 5.7318e+06,
        1.5475e+04],
       ...,
       [7.8788e+07, 5.0154e+11, 3.5628e+08, ..., 4.3907e+05, 1.4725e+08,
        7.9132e+05],
       [8.2930e+07, 5.1686e+11, 4.0054e+08, ..., 4.7585e+05, 1.5148e+08,
        8.3062e+05],
       [6.8626e+07, 5.0194e+11, 3.5677e+08, ..., 3.7586e+05, 1.2275e+08,
        5.9364e+05]])
Coordinates:
  * var      (var) <U6 'NO' 'O3' 'PAN' 'CO' 'ALK4' 'ISOP' 'HNO3' 'H2O2' ...
Dimensions without coordinates: sample

In [11]:
%%time
# Slow step: writes the full prediction DataArray to a NetCDF file
# (about 50 s wall time in the recorded run).
dr_pred.to_netcdf('./train_Y.nc')

CPU times: user 3.56 s, sys: 13.6 s, total: 17.2 s
Wall time: 51.7 s


In [12]:
# Sanity check: print only the header (dimensions and variables) of the
# file that was just written.
!ncdump -h train_Y.nc

netcdf train_Y {
dimensions:
	var = 55 ;
	sample = 57852486 ;
variables:
	string var(var) ;
	double data(sample, var) ;
		data :_FillValue = NaN ;
}
