In [1]:
# Import modules
# ipython magic to plot in line
%matplotlib inline
#import mpld3
#mpld3.enable_notebook()
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import xarray as xr
from astropy.io import ascii
import pytz
# OS interaction
import sys
import os
import glob

In [2]:
# http://stackoverflow.com/questions/38987/how-can-i-merge-two-python-dictionaries-in-a-single-expression
def merge_two_dicts(x, y):
    '''Given two dicts, merge them into a new dict as a shallow copy.'''
    z = x.copy()
    z.update(y)
    return z

In [26]:
# Directories
# Path to raw data
out_dir    = os.path.normpath(r'F:\Work\e\Data\Obs\Canada_Project_Sites\Nov_2014_snow_storm_data')
main_dir   = os.path.normpath(r'C:\Users\new356\Google Drive\Nov2014 Data QC Completed Data')
# MetaDataFile
meta_data_file  = os.path.join(main_dir,'CRHO_Station_lat_long_elevation.txt')
# Ascii input folder
#dir_in     = main_dir + '\QC_ascii'
dir_in     = main_dir + '\QC_Data_ASCII_slim'
# netcdf output folder
dir_out    = out_dir + '\\QC_netcdf'
cfileout   = os.path.join(dir_out,'CRHO.nc')

In [4]:
# Define input format of ascii files
input_format = 'CRHO_TELM'

In [5]:
if input_format == 'CRHO_TELM':
    # Ascii data format info
    c_header = 4 # Header lines
    c_column_line = 1 # line where column names start
    c_delimiter = ','
    # time zone variables
    #tz_in = pytz.timezone('Canada/Mountain')
    tz_in = pytz.timezone('Etc/GMT-6')

In [6]:
# Get file in info
#os.chdir(dir_in) # Move to input
#content = os.listdir(os.getcwd()) # Get list of files
#num_files = len([name for name in os.listdir('.') if os.path.isfile(name)]) # Get number of files in

os.chdir(dir_in) # Move to input
content = glob.glob('*.txt') # Get list of files
num_files = len(content)

In [7]:
# Read in metadata (all stations)
metadata = pd.read_csv(meta_data_file,index_col='station')

In [8]:
content

['BNS_15min_2013_2016_slim.txt',
 'BRP_15min_2014_2016_slim.txt',
 'BWH_15min_2014_slim.txt',
 'CRG_15min_2014_slim.txt',
 'CRN_15min_2013_2016_slim.txt',
 'FLG_15min_2014_2016_slim.txt',
 'FRG_15min_2014_slim.txt',
 'FRS_15min_2013_2016_slim.txt',
 'HLN_15min_2014_2015_slim.txt',
 'PWL_15min_2013_2016_slim.txt',
 'PYT_15min_2013_2016_slim.txt',
 'SIB_15min_2014_slim.txt']

In [9]:
# Initalize stuff
c_dict = {}
stations_all=[]
variables=[]
units_all = {}
time_index = {}

In [10]:
#datain = pd.read_csv(content[4],header=c_column_line,dtype={'TIMESTAMP': datetime}) #,mangle_dupe_cols=False)
#datain.drop(datain.index[:2], inplace=True)
#datain = datain[datain.columns.drop(datain.filter(regex='.1'))]

In [11]:
#datain.filter(regex='.1').columns

In [12]:
#dat = ascii.read(content[4],header_start=c_column_line,data_start=c_header,delimiter=c_delimiter,exclude_names=datain.filter(regex='.1').columns)
#datain = pd.DataFrame(dat.as_array())

In [13]:
#datain2

In [14]:
# Read in each file
for cfile in content:
    
    # Get current station name
    csta_name = cfile[0:3] # Take the first three letter abbreviation as the name
    print('Processing ' + csta_name)
    stations_all.append(csta_name)
    
    # Import data to pandas dataframe
    dat = ascii.read(cfile,header_start=c_column_line,data_start=c_header,delimiter=c_delimiter,exclude_names='N/A')
    datain = pd.DataFrame(dat.as_array())
    
    # Alternate method that drops duplicate (second variables) (Not correct but works for now)
    #print("Using temp fix to process dataframes with duplicate columns. DATA NOT CORRECT!!!!")
    #datain = pd.read_csv(content[4],header=c_column_line) #,mangle_dupe_cols=False)
    #datain.drop(datain.index[:2], inplace=True)
    #datain = datain[datain.columns.drop(datain.filter(regex='.1'))]
    #datain.columns
    
    
    # Replace -9999 with nan (recomended by netcdf)
    datain.replace(-9999,np.NaN,inplace=True)
    
    # Make TIMESTAMP the index
    datain['TIMESTAMP'] = datain['TIMESTAMP'].astype('datetime64[ns]')
    datain = datain.set_index('TIMESTAMP')
    
    # Set time zone
    datain.index = datain.index.tz_localize(tz_in)
    
    # Import header info 
    headerinfo = pd.read_csv(cfile,nrows=2,skiprows=1)
    units = headerinfo.loc[0,:].tolist() # Grab first row of dataframe (units)
    units = units[1:] # Remove first value which is the units of the timestamp
    units_dic = dict(zip(datain.columns,units)) # Dictionary of variable:units for this stations
    units_all = merge_two_dicts(units_all, units_dic) # Merge dictoinaries together (units_dic overwrites any units_all)
    
    # Loop through all variables for this station
    c_variables = datain.columns
    variables.extend(c_variables.values) # Store all variables for use latter
    for c_var in c_variables:
        c_dict[(csta_name,c_var)]        =pd.DataFrame(datain[c_var])
        c_dict[(csta_name,c_var)].columns=[c_var]
        c_dict[(csta_name,c_var)].index  = datain.index
        
    # Save time index for each station (need to fill in missing variables later)
    time_index[csta_name] = datain.index

Processing BNS
Processing BRP
Processing BWH
Processing CRG
Processing CRN
Processing FLG
Processing FRG
Processing FRS
Processing HLN
Processing PWL
Processing PYT
Processing SIB


In [15]:
# Get unique variables from list variables
variables_uniq = set(variables)

In [16]:
# Extract data for each variable from the dictionary and create a xray.Dataset

ds_list = [] # Initalize list of xray Datasets (each a different variable)

# For each unique variable in the dictionary
for c_var in variables_uniq:
    print(c_var)
    all_vars={} # Initialize dictionary that only contains one variable for all stations
    # For each station
    for c_sta in stations_all:
        # Test if this varible was measured at this station
        if ((c_sta,c_var) in c_dict):
            all_vars[c_sta] = c_dict[(c_sta,c_var)]
        else: # Variable doesn't exists at this station so pad it with -9999 (needed to merge into one netcdf file)
            index_csta = time_index[c_sta]
            df_missing = pd.DataFrame(index=index_csta, columns=[c_var])
            #df_missing = df_missing.fillna(-9999)
            all_vars[c_sta] = df_missing

    # Concatenate each variable by stations
    c_obs_all = pd.concat(all_vars,axis=0,keys=stations_all)
    #c_obs_all = pd.DataFrame(c_obs_all) # not needed
    
    # Convert to xray and add to list
    ds = xr.Dataset.from_dataframe(c_obs_all)
    # Add to list and rename variables
    ds_list.append(ds.rename({'level_0':'station','TIMESTAMP':'time'}))  

Soil Temperature C
Soil Moisture B
Wind Direction at A
Incremental Precipitation B
Soil Temperature B
Snow Depth QC value
Scalar Wind Speed B
Downward Solar Radiation
Scalar Wind Speed A
Downward Terrestrial Rad
Soil Temperature A
Soil Moisture C
Soil Moisture A
Snow Layer Temperature A
Soil Moisture E
Soil Temperature D
Soil Moisture D
Total Pressure Adjusted to Sea-level
Incremental Precipitation A
Soil Heat Flux  A
Upward Solar Radiation
Air temperature A
Snow Depth A
Surface Temperature B
Air Moisture Content A
Upward Terrestrial Rad
Snow Water Equivelent A
Surface Temperature A
Soil Temperature E
Total Pressure Unadjusted A


In [17]:
# Combine all variable Datasets using xray.update()
ds_all = xr.Dataset()
[ds_all.update(c_ds) for c_ds in ds_list]
ds_all

<xarray.Dataset>
Dimensions:                               (station: 12, time: 131425)
Coordinates:
  * station                               (station) object 'BNS' 'BRP' 'BWH' ...
  * time                                  (time) object 1349028000000000000 ...
Data variables:
    Soil Temperature C                    (station, time) float64 nan nan ...
    Soil Moisture B                       (station, time) float64 nan nan ...
    Wind Direction at A                   (station, time) float64 nan nan ...
    Incremental Precipitation B           (station, time) float64 nan nan ...
    Soil Temperature B                    (station, time) float64 nan nan ...
    Snow Depth QC value                   (station, time) object nan nan nan ...
    Scalar Wind Speed B                   (station, time) float64 nan nan ...
    Downward Solar Radiation              (station, time) float64 nan nan ...
    Scalar Wind Speed A                   (station, time) float64 nan nan ...
    Downward Terre

In [18]:
# Add variable attributes (units), and fix variable names (remove spaces)
for cvar in ds_all.data_vars:
    # add units as attributes
    ds_all.get(cvar).attrs['unit']   = units_all[cvar]
    # Remove spaces in variable names
    ds_all.rename({cvar:cvar.replace(" ","")},inplace=True)

In [19]:
# Tell xray TIMESTAMP is a datetime (it forgets for some reason)
ds_all['time'] = pd.to_datetime(ds_all.time)

In [20]:
# Ensure it is written in correct local time zone (TODO)

In [21]:
metadata

Unnamed: 0_level_0,lat,long,elevation(m)
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BNS,99,99,2099
PWL,99,99,2100
FRS,99,99,2330
CRN,99,99,2205
BRP,99,99,9999
FLG,99,99,9999
PYT,99,99,9999
BWH,99,99,9999
CRG,99,99,9999
FRG,99,99,9999


In [22]:
# Add meta data for each station
ds_all = ds_all.merge({'Elevation': ('station',metadata[' elevation(m)'])})
ds_all = ds_all.merge({'Lat': ('station',metadata[' lat'])})
ds_all = ds_all.merge({'Lon': ('station',metadata[' long'])})

In [23]:
# Make meta data coordiates from variables
ds_all.set_coords(['Elevation','Lat','Lon'], inplace=False)

<xarray.Dataset>
Dimensions:                           (station: 12, time: 131425)
Coordinates:
  * station                           (station) object 'BNS' 'BRP' 'BWH' ...
  * time                              (time) datetime64[ns] 2012-09-30T18:00:00 ...
    Elevation                         (station) int64 2099 2100 2330 2205 ...
    Lat                               (station) int64 99 99 99 99 99 99 99 ...
    Lon                               (station) int64 99 99 99 99 99 99 99 ...
Data variables:
    SoilTemperatureC                  (station, time) float64 nan nan nan ...
    SoilMoistureB                     (station, time) float64 nan nan nan ...
    WindDirectionatA                  (station, time) float64 nan nan nan ...
    IncrementalPrecipitationB         (station, time) float64 nan nan nan ...
    SoilTemperatureB                  (station, time) float64 nan nan nan ...
    SnowDepthQCvalue                  (station, time) object nan nan nan nan ...
    ScalarWindSpeedB

In [28]:
# Export to netcdf
ds_all.to_netcdf(cfileout,format='netcdf4') 