In [1]:
import pandas as pd
import numpy as np
import pyreadr

## Create fake California data with gridding from Dalhousie

In [2]:
dh_2010 = pd.read_csv('./Data/cali_example/dh_ca_annual_2010.csv')

In [3]:
dh_2010.head()

Unnamed: 0,x,y,PM25
0,-124.405,42.005,
1,-124.395,42.005,
2,-124.385,42.005,
3,-124.375,42.005,
4,-124.365,42.005,


In [4]:
dh_2010 = dh_2010.dropna()

In [5]:
dh_2010 = dh_2010.rename(columns={"x": "lon", "y": "lat", "PM25": "pm25"})

In [6]:
dh_2010.head()

Unnamed: 0,lon,lat,pm25
17,-124.235,42.005,3.1
18,-124.225,42.005,3.1
19,-124.215,42.005,3.1
20,-124.205,42.005,3.0
21,-124.195,42.005,3.0


In [7]:
fake_data_1 = dh_2010.copy()
fake_data_2 = dh_2010.copy()
fake_data_3 = dh_2010.copy()

In [8]:
dh_2010['pm25'].min(), dh_2010['pm25'].max()

(2.79999995231628, 16.5)

In [9]:
np.random.seed(5578942)

#fake_pm25 = np.random.uniform(dh_2010['pm25'].min(), dh_2010['pm25'].max(), dh_2010.shape[0])
fake_pm25 = np.random.uniform(dh_2010['pm25'].min(), dh_2010['pm25'].max(), (3, dh_2010.shape[0]))

In [10]:
fake_pm25.shape

(3, 731615)

In [11]:
fake_data_1['pm25'] = fake_pm25[0]
fake_data_2['pm25'] = fake_pm25[1]
fake_data_3['pm25'] = fake_pm25[2]

In [12]:
fake_data_1.to_csv('./Data/cali_example/dhFake1_2010_align.csv')
fake_data_2.to_csv('./Data/cali_example/dhFake2_2010_align.csv')
fake_data_3.to_csv('./Data/cali_example/dhFake3_2010_align.csv')

## Create training data file

In [13]:
monitor_data = pyreadr.read_r('./Data/epa_data/pm25_observed_2000_2016.rds')
monitor_data = monitor_data[None]

In [14]:
monitor_data.head()

Unnamed: 0,Date,uid,source,pm25_obs,Latitude,Longitude,State.Code,County.Code
0,2000-01-16,10030010,EPA,5.9,30.497478,-87.880258,1,3
1,2000-01-19,10030010,EPA,11.0,30.497478,-87.880258,1,3
2,2000-01-22,10030010,EPA,8.6,30.497478,-87.880258,1,3
3,2000-01-25,10030010,EPA,6.2,30.497478,-87.880258,1,3
4,2000-01-28,10030010,EPA,7.6,30.497478,-87.880258,1,3


In [15]:
cali_monitors = monitor_data.loc[(pd.to_datetime(monitor_data['Date']).dt.year == 2010) & (monitor_data['State.Code'] == '06')]

In [16]:
cali_monitors.head()

Unnamed: 0,Date,uid,source,pm25_obs,Latitude,Longitude,State.Code,County.Code
1559010,2010-01-01,60010007,EPA,22.1,37.687526,-121.784217,6,1
1559011,2010-01-02,60010007,EPA,10.7,37.687526,-121.784217,6,1
1559012,2010-01-03,60010007,EPA,16.4,37.687526,-121.784217,6,1
1559013,2010-01-04,60010007,EPA,30.5,37.687526,-121.784217,6,1
1559014,2010-01-05,60010007,EPA,30.1,37.687526,-121.784217,6,1


In [17]:
cali_monitors = cali_monitors[['Longitude', 'Latitude', 'pm25_obs']]

In [18]:
cali_monitors = cali_monitors.rename(columns={"Longitude": "lon", "Latitude": "lat"})

In [19]:
training_data_2010 = cali_monitors.groupby(['lon','lat']).mean().reset_index()

In [20]:
training_data_2010

Unnamed: 0,lon,lat,pm25_obs
0,-124.179490,40.776780,5.983908
1,-124.162100,40.801780,5.824719
2,-124.083900,41.560800,2.835678
3,-122.922290,39.032700,3.075410
4,-122.908500,38.122400,5.137130
5,-122.804600,40.786400,2.069659
6,-122.633579,41.726892,4.501818
7,-122.403100,40.630400,6.252542
8,-122.401790,40.691840,3.915254
9,-122.380920,40.550130,4.630357


In [21]:
# round to nearest 0.05 
training_data_2010['approx_Long'] = round(training_data_2010.lon/0.01) * 0.01 - 0.005
training_data_2010['approx_Lat'] = round(training_data_2010.lat/0.01) * 0.01 - 0.005

In [22]:
training_data_2010.head()

Unnamed: 0,lon,lat,pm25_obs,approx_Long,approx_Lat
0,-124.17949,40.77678,5.983908,-124.185,40.775
1,-124.1621,40.80178,5.824719,-124.165,40.795
2,-124.0839,41.5608,2.835678,-124.085,41.555
3,-122.92229,39.0327,3.07541,-122.925,39.025
4,-122.9085,38.1224,5.13713,-122.915,38.115


In [23]:
training_data_2010.drop_duplicates(subset = ['approx_Long','approx_Lat'], 
                     keep = False, inplace = True) 

In [24]:
dh_2010 = dh_2010.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_dh'})
dh_2010.head()

Unnamed: 0,approx_Long,approx_Lat,pm25_dh
17,-124.235,42.005,3.1
18,-124.225,42.005,3.1
19,-124.215,42.005,3.1
20,-124.205,42.005,3.0
21,-124.195,42.005,3.0


In [29]:
joined_df = training_data_2010.merge(dh_2010, how='left', on=['approx_Long','approx_Lat'])

In [28]:
fake_data_1 = fake_data_1.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_fakeDH1'})
fake_data_2 = fake_data_2.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_fakeDH2'})
fake_data_3 = fake_data_3.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_fakeDH3'})

In [30]:
joined_df = joined_df.merge(fake_data_1, how='left', on=['approx_Long','approx_Lat'])
joined_df = joined_df.merge(fake_data_2, how='left', on=['approx_Long','approx_Lat'])
joined_df = joined_df.merge(fake_data_3, how='left', on=['approx_Long','approx_Lat'])

In [31]:
joined_df.head()

Unnamed: 0,lon,lat,pm25_obs,approx_Long,approx_Lat,pm25_dh,pm25_fakeDH1,pm25_fakeDH2,pm25_fakeDH3
0,-124.17949,40.77678,5.983908,-124.185,40.775,5.0,9.535996,10.144874,9.43002
1,-124.1621,40.80178,5.824719,-124.165,40.795,,,,
2,-124.0839,41.5608,2.835678,-124.085,41.555,3.0,11.952841,12.663872,15.7604
3,-122.92229,39.0327,3.07541,-122.925,39.025,3.6,15.326726,8.16682,6.183896
4,-122.9085,38.1224,5.13713,-122.915,38.115,,,,


In [None]:
# cmaq_1 = pd.read_csv('./Data/cali_example/CMAQ_2010_align.csv')
# cmaq_2 = pd.read_csv('./Data/cali_example/CMAQ1_2010_align.csv')

In [None]:
# cmaq_1 = cmaq_1.drop(['Unnamed: 0'], axis = 1)
# cmaq_2 = cmaq_2.drop(['Unnamed: 0'], axis = 1)

In [None]:
# cmaq_1 = cmaq_1.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_CMAQ'})
# cmaq_2 = cmaq_2.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_CMAQ1'})

In [None]:
# joined_df = joined_df.merge(cmaq_1, how='left', on=['approx_Long','approx_Lat'])
# joined_df = joined_df.merge(cmaq_2, how='left', on=['approx_Long','approx_Lat'])

In [None]:
# joined_df.head()

In [32]:
joined_df = joined_df.fillna(method = 'bfill', axis = 0)

In [33]:
joined_df = joined_df.drop(['approx_Long', 'approx_Lat'], axis = 1)
joined_df = joined_df.rename(columns={"pm25_dh": "pred_dh", "pm25_fakeDH2": "pred_dhFake2", 'pm25_fakeDH1' : 'pred_dhFake1', "pm25_fakeDH3" : 'pred_dhFake3'})

In [35]:
joined_df.to_csv('./Data/cali_example/training_data_2010.csv', index = False)

In [41]:
dh_2010 = dh_2010.rename(columns={'approx_Long' : 'lon', 'approx_Lat' : 'lat', 'pm25_dh' : 'pm25'})

In [43]:
dh_2010.to_csv('./Data/cali_example/dh_2010_align.csv')