## This script downloads hourly air quality data from DataBC's FTP server and aggregates it by month and year

In [4]:
import pandas as pd
import numpy as np
import urllib.request 
import os

# Root folder on the FTP server; each year has its own sub-folder holding one CSV per species
Data_Path = "ftp://ftp.env.gov.bc.ca/pub/outgoing/AIR/AnnualSummary/"

## 2009 is the first year the data is aggregated yearly - The 2021 data is not available yet
Last_Year = 2020
# Year labels as strings, because they are spliced directly into the download path
Years = list(map(str, range(2009, Last_Year + 1)))
## The gas/particulate data to download
# Other species available - "CO","H2S","NO","NO2","PM10","SO2","TRS"
Species = [
    "O3",
    "PM25",
]

def ReadFile(Data = None, FileName = 'Temp.csv'):
    """Load one downloaded CSV and, optionally, stack it under earlier data.

    The first column of the file is parsed as timestamps and becomes the
    index (named 'datetime').  Two helper columns, 'Month' and 'Year', are
    derived from that index.

    Parameters
    ----------
    Data : pandas.DataFrame or None
        Previously loaded observations.  When None, the newly read file is
        returned on its own; otherwise the new rows are concatenated after
        the rows of `Data` (`Data` itself is not modified).
    FileName : str
        Path of the CSV to read.  Defaults to 'Temp.csv', the scratch file
        the download loop writes to.

    Returns
    -------
    pandas.DataFrame
        The new file's rows, or `Data` followed by the new file's rows.
    """
    # Read identifier-like columns as strings so values such as EMS_ID are not
    # coerced to numbers.  The first column is parsed by position rather than
    # with the dict form of `parse_dates`, which newer pandas deprecates.
    NewData = pd.read_csv(FileName, parse_dates=[0], index_col=0,
                          dtype={"DATE_PST":str,"STATION_NAME":str,"EMS_ID":str,"PARAMETER":str,
                                 "INSTRUMENT":str,"RAW_VALUE":float,"UNIT":str,"ROUNDED_VALUE":float})
    NewData.index.name = 'datetime'
    NewData['Month'] = NewData.index.month
    NewData['Year'] = NewData.index.year

    if Data is None:
        return NewData
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    # and keeps the original index labels, exactly as append did.
    return pd.concat([Data, NewData])

for species in Species:
    print('Downloading '+species)

    # Pull every yearly file for this species through the scratch file and
    # stack them into a single frame.
    Data = None
    for year in Years:
        path = f"{Data_Path}{year}/{species}.csv"
        urllib.request.urlretrieve(path, 'Temp.csv')
        Data = ReadFile(Data)

    # The final year's file spills a few observations into January of the
    # following year; drop them.
    Data = Data[Data['Year'].le(Last_Year)].copy()

    # Stations with sparse records are discarded: a station is kept only if
    # its row count exceeds half the row count of the best-covered station.
    Keep = Data.groupby('EMS_ID')['STATION_NAME'].count()
    MinRows = Keep.max()*.5
    Keep = Keep[Keep>MinRows].index
    Data = Data[Data['EMS_ID'].isin(Keep)].copy()

    # Yearly means: one row per station, one column per year.
    AggData = Data.groupby(['EMS_ID','Year'])[['RAW_VALUE']].mean()
    YearlyWide = AggData.unstack()
    YearlyWide['RAW_VALUE'].to_csv(species+'_Yearly_Averages.csv')

    # Monthly means, keyed by an integer of the form YYYYMM.
    Data['Year_Month'] = Data['Year'].mul(100).add(Data['Month'])
    AggData = Data.groupby(['EMS_ID','Year_Month'])[['RAW_VALUE']].mean()
    MonthlyWide = AggData.unstack()
    MonthlyWide['RAW_VALUE'].to_csv(species+'_Timeseries.csv')

    print('Completed '+species,': there are ',MonthlyWide.shape[0],' stations available to analyze.')
    print()

# The scratch download file is no longer needed once every species is processed.
os.remove('Temp.csv')
print('Done!!')

Downloading O3
Completed O3 : there are  40  stations available to analyze.

Downloading PM25
Completed PM25 : there are  45  stations available to analyze.

Done!!


In [None]:
# Download the monitoring-station table, keep a local copy, and display it
# as the cell output.
Stations_Url = 'ftp://ftp.env.gov.bc.ca/pub/outgoing/AIR/Air_Monitoring_Stations/bc_air_monitoring_stations.csv'
Sites = pd.read_csv(Stations_Url)
Sites.to_csv('MonitoringStations.csv')
Sites