## MONET-Analysis Speciated PM prep notebook

### How to use

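- start the notebook and run cell 1 (imports and helper functions)
- in cell 2 set the start date and end date
- in cell 2 set the network (CSN, NCORE, CASTNET or IMPROVE)
- in cell 2 set the output filename (something like NETWORK_DAILY_STARTDATE_ENDDATE.nc with STARTDATE and ENDDATE in YYYYMMDD format; the default is `{network}_DAILY_2019.nc`)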

In [None]:
import monetio as mio
import numpy as np
import pandas as pd
import xarray as xr
from melodies_monet.util import write_util
import os

import warnings
warnings.filterwarnings('ignore')
# helper function for site ids
def get_siteid(df):
    """Add AQS site identifier columns to ``df`` and return it.

    Two string columns are added to ``df`` in place:

    - ``siteid``: zero-padded state code (2) + county code (3) + site
      number (4) + parameter code (5), followed by the POC.
    - ``siteid2``: zero-padded state code (2) + county code (3) + site
      number (4).

    The site number is read from ``site_num`` when that column exists and
    from ``site_number`` otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``state_code``, ``county_code``, ``parameter_code``,
        ``poc`` and either ``site_num`` or ``site_number`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two new columns.

    Raises
    ------
    AttributeError
        If any required column is missing.
    """
    # Choose the site column explicitly instead of using a bare ``except``,
    # which also swallowed unrelated errors and then reported the wrong
    # missing column.
    site_col = 'site_num' if 'site_num' in df.columns else 'site_number'
    required = ['state_code', 'county_code', site_col, 'parameter_code', 'poc']
    missing = [col for col in required if col not in df.columns]
    if missing:
        # AttributeError is what attribute-style column access raised before.
        raise AttributeError(
            'get_siteid: missing required column(s): {}'.format(', '.join(missing))
        )

    # state + county + site is shared by both identifiers
    site_base = (
        df['state_code'].astype(str).str.zfill(2)
        + df['county_code'].astype(str).str.zfill(3)
        + df[site_col].astype(str).str.zfill(4)
    )
    df['siteid'] = (
        site_base
        + df['parameter_code'].astype(str).str.zfill(5)
        + df['poc'].astype(str)
    )
    df['siteid2'] = site_base
    return df

# helper function to get the site data
def read_site_df(url='https://aqs.epa.gov/aqsweb/airdata/aqs_monitors.zip'):
    """Read the AQS monitor listing and return a trimmed site table.

    The CSV at ``url`` is loaded with pandas, its column names are
    lower-cased with spaces replaced by underscores, ``siteid`` and
    ``siteid2`` are added through ``get_siteid``, and a fixed list of
    columns is dropped before the table is returned.
    """
    monitors = pd.read_csv(url)
    # normalise headers, e.g. 'State Code' -> 'state_code'
    monitors = monitors.rename(columns=lambda name: name.lower().replace(' ', '_'))
    monitors = get_siteid(monitors)
    # columns removed from the returned site table
    unwanted = [
        'state_code', 'county_code', 'site_number',
        'reporting_agency', 'pqao', 'collecting_agency', 'exclusions',
        'monitoring_objective', 'last_method_code', 'last_method',
        'measurement_scale', 'measurement_scale_definition',
        'naaqs_primary_monitor', 'qa_primary_monitor', 'local_site_name',
        'address', 'county_name', 'city_name',
        'tribe_name', 'extraction_date', 'datum',
        'first_year_of_data', 'last_sample_date', 'parameter_name', 'poc',
    ]
    return monitors.drop(columns=unwanted)

In [None]:
# set the dates
dates = pd.date_range(start='2019-08-01', end='2019-08-31', freq='H')  # note: this just gets the start year for these

# SET NETWORK
network = 'NCORE'  # CSN NCORE CASTNET IMPROVE

# set the output filename
outname = '{}_DAILY_2019.nc'.format(network)

# add the data
df = mio.aqs.add_data(dates, param=['PM10SPEC', 'SPEC'], wide_fmt=False, daily=True)

# add siteids (this is broken in monetio currently...need to fix)
df = get_siteid(df)

# Mask non-positive observations as NaN, then drop rows that have no
# observation or no coordinates.
# A single .loc assignment is used here: the chained form
# df['obs'][mask] = ... may write to a temporary copy (and always does under
# pandas Copy-on-Write), leaving df unchanged.
df.loc[df['obs'] <= 0, 'obs'] = np.nan
df = df.dropna(subset=['obs']).dropna(subset=['latitude', 'longitude'])

In [None]:
# load the site table and label sites that list no network
sitedf = read_site_df()
sitedf['networks'] = sitedf['networks'].fillna('UNSPECIFIED')
# keep the sites whose network list mentions the selected network
site_network = (
    sitedf[sitedf['networks'].str.contains(network)]
    .drop(columns=['parameter_code', 'siteid'])
)
# one row per site, keyed by 'x'
sn = site_network.drop_duplicates(subset=['siteid2']).rename(columns={'siteid2': 'x'})

In [None]:
# keep only observations from sites in the current network, then pivot to
# one column per variable indexed by (time, x)
dff = (
    df[df['siteid2'].isin(site_network['siteid2'])]
    .rename(columns={'time_local': 'time', 'siteid2': 'x'})
    .pivot_table(values='obs', index=['time', 'x'], columns=['variable'])
)
# average over identical (time, x) index entries and convert to xarray
dfp = (
    dff.groupby(level=dff.index.names)
    .mean()
    .to_xarray()
)

In [None]:
# keep only the sites that actually appear in the pivoted observations
sn = sn[sn['x'].isin(dff.reset_index()['x'])].drop_duplicates(subset=['x'])
# index by site id and convert to xarray
sn = sn.set_index('x').to_xarray()

In [None]:
# inner-join the observation and site datasets, then add a 'y' dimension
# and order the dimensions as (time, y, x)
output = (
    xr.merge([dfp, sn], join='inner')
    .expand_dims('y')
    .transpose('time', 'y', 'x')
)

In [None]:
# write the file
# Replace spaces in data variable names with underscores. Dataset.rename
# returns a new object rather than modifying in place, so the result must be
# assigned back for the renaming to reach the written file.
rename_map = {name: name.replace(' ', '_') for name in output.data_vars if ' ' in name}
output = output.rename(rename_map)
# report the path that is actually written (relative to the working directory)
print(outname)
write_util.write_ncf(output, outname)