NAME: beacon_processed

PURPOSE: Create processed data for dissemination that is averaged hourly and converted to concentration units

INPUTS/CHANGES REQUIRED: Change site names, site codes, start date and end date, and (when processing a different species) the species settings spec and specnm

CREATED: by Kaitlyn Lieschke on 13.02.18

In [1]:
import pandas as pd
import numpy as np
import csv
#import glob as glob << Need to do manually, as below
import os
from datetime import datetime, timedelta
from dateutil.parser import parse, tz
from dateutil.relativedelta import relativedelta

In [30]:
# ---------------------------------------------------------------------------
# Run configuration: edit the values in this cell before processing.
# ---------------------------------------------------------------------------

# Sites to be processed. ``sitenames`` are used to build the input folder
# paths; ``sitecodes`` are the prefixes of the output column names. The two
# arrays are paired by position.
# Full list of sites, kept for reference:
#sitenames = np.array(['hercules','ohlone','collins','steward','montalvin','sheldon',
#                     'middlecollege','richmondhs','peres','dejean','nystrom','madera',
#                     'portola','verde','fieldstation','washington'])
sitenames = np.array(['dejean', 'peres', 'richmondhs'])
sitecodes = np.array(['DEJ_', 'PER_', 'RHS_'])
if len(sitenames) != len(sitecodes):
    raise ValueError("sitenames and sitecodes must have the same length")

# Start and end dates for the data to process, format YYYY-MM-DD.
# ``end_date`` is the day AFTER the last day of processed data.
start_date = '2017-09-01'
end_date = '2017-10-01'

# The datetime versions are derived from the strings above so that each date
# only has to be edited in one place and the two forms cannot disagree.
start_datetime = datetime.strptime(start_date, '%Y-%m-%d')
end_datetime = datetime.strptime(end_date, '%Y-%m-%d')

# Species settings:
#   spec   - label appended to each site code to name the output column
#   specnm - name of the raw-data column that is read and averaged
spec = "PM(ug/m3)"
specnm = "%FS PM"

In [2]:
import sys
import os
import re
import fnmatch
# Compatibility shim used by glob1() below: bind ``_unicode`` to the built-in
# ``unicode`` type when the interpreter provides one, otherwise to an empty
# placeholder class so that ``isinstance(x, _unicode)`` is simply False.
try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support, the unicode type
    # will not exist. Fake one.
    class _unicode(object):
        pass
# Names intended as the public interface of this hand-copied glob code
# (it stands in for ``import glob``, see the note in the import cell).
__all__ = ["glob", "iglob"]
def glob(pathname):
    """Collect every path that matches a shell-style pattern into a list.

    Wildcards follow fnmatch rules, with one exception: names that begin
    with a dot are only matched when the pattern itself spells out the
    leading dot; '*' and '?' never match it.

    This is the eager counterpart of iglob(), which does the actual work.
    """
    return [match for match in iglob(pathname)]
def iglob(pathname):
    """Lazily yield each path that matches a shell-style pattern.

    Wildcards follow fnmatch rules, with one exception: names that begin
    with a dot are only matched when the pattern itself spells out the
    leading dot; '*' and '?' never match it.
    """
    head, tail = os.path.split(pathname)

    # No wildcard anywhere: the pattern is a literal path, so just test it.
    if not has_magic(pathname):
        if tail:
            present = os.path.lexists(pathname)
        else:
            # A trailing slash means "directories only".
            present = os.path.isdir(head)
        if present:
            yield pathname
        return

    # Wildcard with no directory part: search the current directory and
    # yield bare names (no directory prefix).
    if not head:
        for match in glob1(os.curdir, tail):
            yield match
        return

    # os.path.split() hands back its argument unchanged as the directory
    # part for drive or UNC paths. Only recurse when the directory part is
    # really shorter, otherwise a path such as r'\\?\C:' would recurse forever.
    if head != pathname and has_magic(head):
        parents = iglob(head)
    else:
        parents = [head]

    # A literal final component only needs an existence check.
    list_matches = glob1 if has_magic(tail) else glob0

    for parent in parents:
        for match in list_matches(parent, tail):
            yield os.path.join(parent, match)

# These 2 helper functions non-recursively glob inside a literal directory.
# They return a list of basenames. `glob1` accepts a pattern while `glob0`
# takes a literal basename (so it only has to check for its existence).

def glob1(dirname, pattern):
    """Return the basenames inside ``dirname`` that match ``pattern``.

    An empty ``dirname`` means the current directory. A directory that
    cannot be listed yields an empty list. Entries starting with a dot are
    skipped unless the pattern itself starts with a dot.
    """
    dirname = dirname or os.curdir

    # When the pattern is unicode but the directory is not, decode the
    # directory name so the listing comes back in a matching string type.
    if isinstance(pattern, _unicode) and not isinstance(dirname, unicode):
        encoding = sys.getfilesystemencoding() or sys.getdefaultencoding()
        dirname = unicode(dirname, encoding)

    try:
        entries = os.listdir(dirname)
    except os.error:
        return []

    if pattern[0] != '.':
        entries = [entry for entry in entries if entry[0] != '.']
    return fnmatch.filter(entries, pattern)

def glob0(dirname, basename):
    """Return ``[basename]`` if that literal entry exists in ``dirname``, else ``[]``.

    An empty ``basename`` comes from a path that ended with a directory
    separator (os.path.split() leaves the last component empty), so in that
    case the match only counts when ``dirname`` is a directory.
    """
    if basename == '':
        found = os.path.isdir(dirname)
    else:
        found = os.path.lexists(os.path.join(dirname, basename))
    return [basename] if found else []


# Matches any of the characters that make a path a wildcard pattern: * ? [
magic_check = re.compile('[*?[]')


def has_magic(s):
    """Return True when ``s`` contains at least one wildcard character."""
    hit = magic_check.search(s)
    return hit is not None

In [19]:
#Create function to hourly average data
def hourly_avg(df, datecol, pmcol):
    """Average the values of ``pmcol`` over each clock hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw measurements. It is only read; no columns are added to it.
    datecol : str
        Name of the column holding the measurement time (date strings or
        datetimes).
    pmcol : str
        Name of the column to average. Values that cannot be converted to
        numbers are treated as missing and ignored by the mean.

    Returns
    -------
    pandas.DataFrame
        Two columns, in this order: ``pmcol`` (the hourly mean) and
        ``'UTC time'`` (the start of the hour, without timezone). There is
        one row per hour present in the input, in order of first appearance
        (the rows are NOT sorted by time).
    """
    stamps = pd.to_datetime(df[datecol])
    if stamps.dt.tz is not None:
        # Keep the wall-clock fields as written and drop the offset, so the
        # returned times are timezone-naive.
        stamps = stamps.dt.tz_localize(None)

    # Work on a fresh two-column frame so the caller's frame is untouched and
    # only the averaged column is coerced to numeric.
    hourly = pd.DataFrame({
        'UTC time': stamps.dt.floor('h').values,
        pmcol: pd.to_numeric(df[pmcol], errors='coerce').values,
    })

    # sort=False keeps the groups in order of first appearance.
    averaged = hourly.groupby('UTC time', as_index=False, sort=False).agg({pmcol: 'mean'})
    return averaged[[pmcol, 'UTC time']]

In [4]:
#Create function to generate array of datetimes at a given interval
def date_range(start_datetime, end_datetime, increment, period):
    """Return a list of datetimes from ``start_datetime`` up to, but not
    including, ``end_datetime``.

    Parameters
    ----------
    start_datetime : datetime
        First value of the result.
    end_datetime : datetime
        Exclusive upper bound.
    increment : number
        Size of one step, in units of ``period``.
    period : str
        A relativedelta keyword naming the step unit, e.g. ``'hours'`` or
        ``'months'``.

    Returns
    -------
    list of datetime
        Empty when ``start_datetime`` is not before ``end_datetime``.

    Raises
    ------
    ValueError
        If the step does not move forward in time (for example an increment
        of zero), which would otherwise loop forever.
    """
    def offset(steps):
        # Offset of ``steps`` whole increments from the start.
        return relativedelta(**{period: increment * steps})

    one_step = offset(1)
    if start_datetime < end_datetime and not start_datetime + one_step > start_datetime:
        raise ValueError(
            "increment %r of %r does not advance past the start" % (increment, period))

    result = []
    steps = 0
    nxt = start_datetime
    while nxt < end_datetime:
        result.append(nxt)
        steps += 1
        # Always measure from the start instead of adding to the previous
        # value: month steps are clipped to the month length, and repeated
        # addition would carry that clipping forward
        # (Jan 31 -> Feb 28 -> Mar 28 instead of Mar 31).
        nxt = start_datetime + offset(steps)
    return result

In [5]:
#Create function to produce local datetimes (GMT-7) from UTC datetimes
def local_datetime(df):
    """Return ``df`` with a leading column of local (Los Angeles) times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'Time(UTC)'`` column of times that are in UTC.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first column, ``'Time(GMT-7)'``, holds the
        timezone-aware local time of every row, followed by all columns of
        ``df``.

    Notes
    -----
    The conversion uses the ``America/Los_Angeles`` zone, so the offset is
    -07:00 during daylight saving time and -08:00 otherwise. The column
    label ``'Time(GMT-7)'`` is kept unchanged because it is the header
    written to the output file.
    """
    utc_times = pd.to_datetime(df['Time(UTC)'])
    if utc_times.dt.tz is not None:
        # The column is defined to be UTC: discard any attached zone and keep
        # the clock reading, then label it as UTC below.
        utc_times = utc_times.dt.tz_localize(None)

    # Convert the whole column at once. This avoids the row-by-row chained
    # assignment (frame[col][row] = value), which pandas does not guarantee
    # to write through to the frame.
    local_times = utc_times.dt.tz_localize('UTC').dt.tz_convert('America/Los_Angeles')
    local = local_times.to_frame(name='Time(GMT-7)')

    finaldf = pd.concat([local, df], axis=1)
    return finaldf

In [6]:
# # List all sites to be processed
# #sitenames = np.array(['hercules','ohlone','collins','steward','montalvin','sheldon',
# #                     'middlecollege','richmondhs','peres','dejean','nystrom','madera',
# #                     'portola','verde','fieldstation','washington'])
# sitenames = np.array(['laney3','BAAQMD_sanpablo','ebmud3'])
# sitecodes = np.array(['LAN_','BSP_','EBM_'])

# # Set start and end dates for data to process
# start_date = '2017-01-01' # format YYYY-MM-DD
# start_datetime = datetime(2017,1,1)
# end_date = '2018-01-01' #Day after last day of processed data, format YYYY-MM-DD
# end_datetime = datetime(2018,1,1) #Day after last day of processed data

In [33]:
# Column names for the raw site files, in file order (the files are read
# without a header row). pandas.read_csv does not accept duplicate entries in
# ``names``, so the second column of each Alpha sensor pair carries a ".1"
# suffix.
# NOTE(review): what distinguishes the two columns of each Alpha pair is not
# visible here - give them descriptive names once confirmed.
headernames = ["BMP temperature", "pressure", "SHT temperature", "RH", "dew point temp",
               "MiCS O3", "MiCS CO", "MiCS NO2",
               "Alpha O3", "Alpha O3.1", "Alpha CO", "Alpha CO.1",
               "Alpha NO", "Alpha NO.1", "Alpha NO2", "Alpha NO2.1",
               "high PM", "lowPM", "%FS PM", "CO2", "GMP temperature", "UTC time"]

# Months covered by the run, as 'YYYY_MM' strings (the names of the monthly
# data folders).
monthrange = np.array(
    [month.strftime('%Y_%m')
     for month in date_range(start_datetime, end_datetime, 1, 'months')],
    dtype=object)

# One row per hour of the run; site columns are added to this frame below.
daterange = pd.DataFrame(np.array(date_range(start_datetime, end_datetime, 1, 'hours')))
daterange.columns = ["Time(UTC)"]

# Read, average and combine data from all sites
for sitename, sitecode in zip(sitenames, sitecodes):

    # New column for this site in the final dataframe, populated with NaN
    sitecolumn = sitecode + spec
    daterange[sitecolumn] = np.nan

    for month in monthrange:

        # Read and concatenate all files of one site for one month.
        # Sorting makes the read order reproducible (directory listing order
        # is arbitrary).
        folder = '/Users/kaitlynlieschke/Documents/' + sitename + '/data/' + month + '/'
        files = sorted(glob(os.path.join(folder, "*.csv")))
        if not files:
            continue
        df = pd.concat((pd.read_csv(csv_path, names=headernames, na_values='-999')
                        for csv_path in files), ignore_index=True)

        # Select only the relevant columns, remove rows with a missing value
        # or date, and limit the data to the requested dates (the dates are
        # compared as strings).
        df = df[[specnm, 'UTC time']]
        df = df.dropna(axis=0, how='any')
        in_window = (df['UTC time'] >= start_date) & (df['UTC time'] <= end_date)
        df = df[in_window].reset_index(drop=True)
        if df.empty:
            # Nothing inside the requested window for this month
            continue

        # Create hourly averages of data
        df = hourly_avg(df, 'UTC time', specnm)

        # Convert the hourly means to output units.
        # NOTE(review): 3.5 and 5 are presumably the calibration slope and
        # offset from the raw reading to ug/m3 - confirm and name them.
        pmconc = pd.Series((df[specnm] * 3.5 + 5).values,
                           index=pd.to_datetime(df['UTC time']))

        # Place each hourly value on the row with the same timestamp. Looking
        # values up by timestamp does not depend on the order of the
        # measurements and leaves hours without data as NaN.
        matched = daterange['Time(UTC)'].map(pmconc)
        daterange[sitecolumn] = daterange[sitecolumn].fillna(matched)

# Add column of local datetimes to dataframe
finaldf = local_datetime(daterange)

In [35]:
# Preview the first five rows of the combined table
finaldf.head(n=5)

Unnamed: 0,Time(GMT-7),Time(UTC),DEJ_CO2(ppm),PER_CO2(ppm),RHS_CO2(ppm)
0,2017-08-31 17:00:00-07:00,2017-09-01 00:00:00,392.412445,403.146885,328.844166
1,2017-08-31 18:00:00-07:00,2017-09-01 01:00:00,409.505246,419.886463,340.356441
2,2017-08-31 19:00:00-07:00,2017-09-01 02:00:00,425.784389,443.212773,359.549727
3,2017-08-31 20:00:00-07:00,2017-09-01 03:00:00,453.873064,460.28929,356.98623
4,2017-08-31 21:00:00-07:00,2017-09-01 04:00:00,473.23224,476.076856,371.37238


In [36]:
# Write the combined table to disk: comma separated, missing values written
# as the text 'NaN', and without the row index.
finaldf.to_csv('SepPM.csv', sep=',', na_rep='NaN', index=False)