In [1]:
# Import modules
# ipython magic to plot in line
%matplotlib inline
#import mpld3
#mpld3.enable_notebook()
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from astropy.io import ascii
import pytz
# OS interaction
import sys
import os

In [2]:
# http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
def merge_two_dicts(x, y):
    '''Given two dicts, merge them into a new dict as a shallow copy.'''
    z = x.copy()
    z.update(y)
    return z

In [3]:
# Directories
# Path to raw data
main_dir   = os.path.normpath(r'F:\Work\e\Data\Obs\Canada_Project_Sites\Nov_2014_snow_storm_data')
# Ascii input folder
dir_in     = main_dir + '\QC_ascii'
# netcdf output folder
dir_out    = main_dir + '\\QC_netcdf'
cfileout   = os.path.join(dir_out,'CRHO.nc')

In [4]:
# Define input format of ascii files
input_format = 'CRHO_TELM'

In [5]:
if input_format == 'CRHO_TELM':
    # Ascii data format info
    c_header = 4 # Header lines
    c_column_line = 1 # line where column names start
    c_delimiter = ','
    # time zone variables
    #tz_in = pytz.timezone('Canada/Mountain')
    tz_in = pytz.timezone('Etc/GMT-6')

In [6]:
# Get file in info
os.chdir(dir_in) # Move to input
content = os.listdir(os.getcwd()) # Get list of files
num_files = len([name for name in os.listdir('.') if os.path.isfile(name)]) # Get number of files in

In [7]:
# Initalize stuff
c_dict = {}
stations_all=[]
variables=[]
units_all = {}
time_index = {}

In [8]:
# Read in each file
for cfile in content:
    # Get current station name
    csta_name = cfile[0:3] # Take the first three letter abbreviation as the name
    print('Processing ' + csta_name)
    stations_all.append(csta_name)
    # Import data to pandas dataframe
    dat = ascii.read(cfile,header_start=c_column_line,data_start=c_header,delimiter=c_delimiter,exclude_names='N/A')
    datain = pd.DataFrame(dat.as_array())
    
    # Replace -9999 with nan (recomended by netcdf)
    datain.replace(-9999,np.NaN,inplace=True)
    
    # Make TIMESTAMP the index
    datain['TIMESTAMP'] = datain['TIMESTAMP'].astype('datetime64[ns]')
    datain = datain.set_index('TIMESTAMP')
    
    # Set time zone
    datain.index = datain.index.tz_localize(tz_in)
    
    # Import header info 
    headerinfo = pd.read_csv(cfile,nrows=2,skiprows=1)
    units = headerinfo.loc[0,:].tolist() # Grab first row of dataframe (units)
    units = units[1:] # Remove first value which is the units of the timestamp
    units_dic = dict(zip(datain.columns,units)) # Dictionary of variable:units for this stations
    units_all = merge_two_dicts(units_all, units_dic) # Merge dictoinaries together (units_dic overwrites any units_all)
    
    # Loop through all variables for this station
    c_variables = datain.columns
    variables.extend(c_variables.values) # Store all variables for use latter
    for c_var in c_variables:
        c_dict[(csta_name,c_var)]        =pd.DataFrame(datain[c_var])
        c_dict[(csta_name,c_var)].columns=[c_var]
        c_dict[(csta_name,c_var)].index  = datain.index
        
    # Save time index for each station (need to fill in missing variables later)
    time_index[csta_name] = datain.index

Processing BNS
Processing CRN
Processing FRS
Processing PWL


In [9]:
# Get unique variables from list variables
variables_uniq = set(variables)

In [10]:
# Extract data for each variable from the dictionary and create a xray.Dataset

ds_list = [] # Initalize list of xray Datasets (each a different variable)

# For each unique variable in the dictionary
for c_var in variables_uniq:
    print(c_var)
    all_vars={} # Initialize dictionary that only contains one variable for all stations
    # For each station
    for c_sta in stations_all:
        # Test if this varible was measured at this station
        if ((c_sta,c_var) in c_dict):
            all_vars[c_sta] = c_dict[(c_sta,c_var)]
        else: # Variable doesn't exists at this station so pad it with -9999 (needed to merge into one netcdf file)
            index_csta = time_index[c_sta]
            df_missing = pd.DataFrame(index=index_csta, columns=[c_var])
            #df_missing = df_missing.fillna(-9999)
            all_vars[c_sta] = df_missing

    # Concatenate each variable by stations
    c_obs_all = pd.concat(all_vars,axis=0,keys=stations_all)
    #c_obs_all = pd.DataFrame(c_obs_all) # not needed
    
    # Convert to xray and add to list
    ds = xr.Dataset.from_dataframe(c_obs_all)
    # Add to list and rename variables
    ds_list.append(ds.rename({'level_0':'station','TIMESTAMP':'time'}))  

Snow Water Equivelent A
Net Radiation
Total Pressure unadjusted A
Total Pressure adjusted to sea-level
Soil Temperature C
Incremental Precipitation A
Soil Temperature D
Air Moisture Content A
Soil Temperature B
Incremental Precipitation B
Soil Temperature A
Albedo
Scalar Wind Speed A
Soil Moisture A
Upward Solar Radiation
Downward Solar Radiation
Air temperature A
Upward Terrestrial Rad
Snow Depth QC value
Soil Temperature E
Soil Moisture C
Scalar Wind Speed B
Soil Heat Flux  A
Wind Direction at A
Soil Moisture E
Soil Moisture B
Downward Terrestrial Rad
Snow Depth A
Soil Moisture D


In [11]:
# Combine all variable Datasets using xray.update()
ds_all = xr.Dataset()
[ds_all.update(c_ds) for c_ds in ds_list]
ds_all

<xarray.Dataset>
Dimensions:                               (station: 4, time: 131425)
Coordinates:
  * station                               (station) object 'BNS' 'CRN' 'FRS' ...
  * time                                  (time) object 1349028000000000000 ...
Data variables:
    Snow Water Equivelent A               (station, time) float64 nan nan ...
    Net Radiation                         (station, time) float64 nan nan ...
    Total Pressure unadjusted A           (station, time) float64 nan nan ...
    Total Pressure adjusted to sea-level  (station, time) float64 nan nan ...
    Soil Temperature C                    (station, time) float64 nan nan ...
    Incremental Precipitation A           (station, time) float64 nan nan ...
    Soil Temperature D                    (station, time) float64 nan nan ...
    Air Moisture Content A                (station, time) float64 nan nan ...
    Soil Temperature B                    (station, time) float64 nan nan ...
    Incremental Precip

In [12]:
# Add variable attributes (units), and fix variable names (remove spaces)
for cvar in ds_all.data_vars:
    # add units as attributes
    ds_all.get(cvar).attrs['unit']   = units_all[cvar]
    # Remove spaces in variable names
    ds_all.rename({cvar:cvar.replace(" ","")},inplace=True)

In [13]:
# Tell xray TIMESTAMP is a datetime (it forgets for some reason)
ds_all['time'] = pd.to_datetime(ds_all.time)

In [14]:
ds_all

<xarray.Dataset>
Dimensions:                           (station: 4, time: 131425)
Coordinates:
  * station                           (station) object 'BNS' 'CRN' 'FRS' 'PWL'
  * time                              (time) datetime64[ns] 2012-09-30T18:00:00 ...
Data variables:
    SnowWaterEquivelentA              (station, time) float64 nan nan nan ...
    NetRadiation                      (station, time) float64 nan nan nan ...
    TotalPressureunadjustedA          (station, time) float64 nan nan nan ...
    TotalPressureadjustedtosea-level  (station, time) float64 nan nan nan ...
    SoilTemperatureC                  (station, time) float64 nan nan nan ...
    IncrementalPrecipitationA         (station, time) float64 nan nan nan ...
    SoilTemperatureD                  (station, time) float64 nan nan nan ...
    AirMoistureContentA               (station, time) float64 nan nan nan ...
    SoilTemperatureB                  (station, time) float64 nan nan nan ...
    IncrementalPrecipita

In [15]:
# Export to netcdf
ds_all.to_netcdf(cfileout,format='netcdf4') 