In [1]:
import pandas as pd
import numpy as np
import pyreadr

## Create fake California data with gridding from Dalhousie

In [2]:
dh_2010 = pd.read_csv('./Data/cali_example/dh_ca_annual_2010.csv')

In [3]:
dh_2010.head()

Unnamed: 0,x,y,PM25
0,-124.405,42.005,
1,-124.395,42.005,
2,-124.385,42.005,
3,-124.375,42.005,
4,-124.365,42.005,


In [4]:
dh_2010 = dh_2010.dropna()

In [5]:
dh_2010 = dh_2010.rename(columns={"x": "lon", "y": "lat", "PM25": "pm25"})

In [6]:
dh_2010.head()

Unnamed: 0,lon,lat,pm25
17,-124.235,42.005,3.1
18,-124.225,42.005,3.1
19,-124.215,42.005,3.1
20,-124.205,42.005,3.0
21,-124.195,42.005,3.0


In [7]:
fake_data_1 = dh_2010.copy()
fake_data_2 = dh_2010.copy()
fake_data_3 = dh_2010.copy()

In [8]:
dh_2010['pm25'].min(), dh_2010['pm25'].max()

(2.79999995231628, 16.5)

In [9]:
np.random.seed(5578942)

#fake_pm25 = np.random.uniform(dh_2010['pm25'].min(), dh_2010['pm25'].max(), dh_2010.shape[0])
fake_pm25 = np.random.uniform(dh_2010['pm25'].min(), dh_2010['pm25'].max(), (3, dh_2010.shape[0]))

In [10]:
fake_pm25.shape

(3, 731615)

In [11]:
fake_data_1['pm25'] = fake_pm25[0]
fake_data_2['pm25'] = fake_pm25[1]
fake_data_3['pm25'] = fake_pm25[2]

In [12]:
fake_data_1.to_csv('./Data/cali_example/dhFake1_2010_align.csv')
fake_data_2.to_csv('./Data/cali_example/dhFake2_2010_align.csv')
fake_data_3.to_csv('./Data/cali_example/dhFake3_2010_align.csv')

## Create training data file

In [None]:
fake_data_1 = pd.read_csv('./Data/cali_example/dhFake1_2010_align.csv')
fake_data_2 = pd.read_csv('./Data/cali_example/dhFake2_2010_align.csv')
fake_data_3 = pd.read_csv('./Data/cali_example/dhFake3_2010_align.csv')

In [13]:
monitor_data = pyreadr.read_r('./Data/epa_data/pm25_observed_2000_2016.rds')
monitor_data = monitor_data[None]

In [14]:
monitor_data.head()

Unnamed: 0,Date,uid,source,pm25_obs,Latitude,Longitude,State.Code,County.Code
0,2000-01-16,10030010,EPA,5.9,30.497478,-87.880258,1,3
1,2000-01-19,10030010,EPA,11.0,30.497478,-87.880258,1,3
2,2000-01-22,10030010,EPA,8.6,30.497478,-87.880258,1,3
3,2000-01-25,10030010,EPA,6.2,30.497478,-87.880258,1,3
4,2000-01-28,10030010,EPA,7.6,30.497478,-87.880258,1,3


In [15]:
cali_monitors = monitor_data.loc[(pd.to_datetime(monitor_data['Date']).dt.year == 2010) & (monitor_data['State.Code'] == '06') & ((monitor_data['County.Code'] == '001') | (monitor_data['County.Code'] == '007') | (monitor_data['County.Code'] == '009'))]

In [16]:
cali_monitors.shape

(298, 8)

In [18]:
cali_monitors.Longitude.min(), cali_monitors.Latitude.min(), cali_monitors.Longitude.max(), cali_monitors.Latitude.max()

(-121.843286, 37.687526, -120.680277, 39.757371)

In [19]:
dh_2010 = dh_2010.loc[(dh_2010.lon >= cali_monitors.Longitude.min()) & (dh_2010.lon <= cali_monitors.Longitude.max()) & (dh_2010.lat >= cali_monitors.Latitude.min()) & (dh_2010.lat <= cali_monitors.Latitude.max())]

In [20]:
fake_data_1 = fake_data_1.loc[(fake_data_1.lon >= cali_monitors.Longitude.min()) & (fake_data_1.lon <= cali_monitors.Longitude.max()) & (fake_data_1.lat >= cali_monitors.Latitude.min()) & (fake_data_1.lat <= cali_monitors.Latitude.max())]

In [21]:
fake_data_2 = fake_data_2.loc[(fake_data_2.lon >= cali_monitors.Longitude.min()) & (fake_data_2.lon <= cali_monitors.Longitude.max()) & (fake_data_2.lat >= cali_monitors.Latitude.min()) & (fake_data_2.lat <= cali_monitors.Latitude.max())]

In [22]:
fake_data_3 = fake_data_3.loc[(fake_data_3.lon >= cali_monitors.Longitude.min()) & (fake_data_3.lon <= cali_monitors.Longitude.max()) & (fake_data_3.lat >= cali_monitors.Latitude.min()) & (fake_data_3.lat <= cali_monitors.Latitude.max())]

In [24]:
cali_monitors = cali_monitors[['Longitude', 'Latitude', 'pm25_obs']]

In [25]:
cali_monitors = cali_monitors.rename(columns={"Longitude": "lon", "Latitude": "lat"})

In [26]:
training_data_2010 = cali_monitors.groupby(['lon','lat']).mean().reset_index()

In [27]:
training_data_2010

Unnamed: 0,lon,lat,pm25_obs
0,-121.843286,39.757371,7.987931
1,-121.784217,37.687526,8.575714
2,-120.680277,38.20185,4.883333


In [28]:
# round to nearest 0.05 
training_data_2010['approx_Long'] = round(training_data_2010.lon/0.01) * 0.01 - 0.005
training_data_2010['approx_Lat'] = round(training_data_2010.lat/0.01) * 0.01 - 0.005

In [29]:
training_data_2010.head()

Unnamed: 0,lon,lat,pm25_obs,approx_Long,approx_Lat
0,-121.843286,39.757371,7.987931,-121.845,39.755
1,-121.784217,37.687526,8.575714,-121.785,37.685
2,-120.680277,38.20185,4.883333,-120.685,38.195


In [23]:
training_data_2010.drop_duplicates(subset = ['approx_Long','approx_Lat'], 
                     keep = False, inplace = True) 

In [30]:
dh_2010 = dh_2010.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_dh'})
dh_2010.head()

Unnamed: 0,approx_Long,approx_Lat,pm25_dh
231557,-121.835,39.755,7.3
231558,-121.825,39.755,7.2
231559,-121.815,39.755,7.2
231560,-121.805,39.755,7.2
231561,-121.795,39.755,6.8


In [31]:
joined_df = training_data_2010.merge(dh_2010, how='left', on=['approx_Long','approx_Lat'])

In [32]:
fake_data_1 = fake_data_1.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_fakeDH1'})
fake_data_2 = fake_data_2.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_fakeDH2'})
fake_data_3 = fake_data_3.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_fakeDH3'})

In [33]:
joined_df = joined_df.merge(fake_data_1, how='left', on=['approx_Long','approx_Lat'])
joined_df = joined_df.merge(fake_data_2, how='left', on=['approx_Long','approx_Lat'])
joined_df = joined_df.merge(fake_data_3, how='left', on=['approx_Long','approx_Lat'])

In [34]:
joined_df.head()

Unnamed: 0,lon,lat,pm25_obs,approx_Long,approx_Lat,pm25_dh,pm25_fakeDH1,pm25_fakeDH2,pm25_fakeDH3
0,-121.843286,39.757371,7.987931,-121.845,39.755,,,,
1,-121.784217,37.687526,8.575714,-121.785,37.685,,,,
2,-120.680277,38.20185,4.883333,-120.685,38.195,6.8,10.566755,5.498932,15.060017


In [None]:
# cmaq_1 = pd.read_csv('./Data/cali_example/CMAQ_2010_align.csv')
# cmaq_2 = pd.read_csv('./Data/cali_example/CMAQ1_2010_align.csv')

In [None]:
# cmaq_1 = cmaq_1.drop(['Unnamed: 0'], axis = 1)
# cmaq_2 = cmaq_2.drop(['Unnamed: 0'], axis = 1)

In [None]:
# cmaq_1 = cmaq_1.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_CMAQ'})
# cmaq_2 = cmaq_2.rename(columns={"lon": "approx_Long", "lat": "approx_Lat", 'pm25' : 'pm25_CMAQ1'})

In [None]:
# joined_df = joined_df.merge(cmaq_1, how='left', on=['approx_Long','approx_Lat'])
# joined_df = joined_df.merge(cmaq_2, how='left', on=['approx_Long','approx_Lat'])

In [None]:
# joined_df.head()

In [35]:
joined_df = joined_df.fillna(method = 'bfill', axis = 0)

In [36]:
joined_df = joined_df.drop(['approx_Long', 'approx_Lat'], axis = 1)
joined_df = joined_df.rename(columns={"pm25_dh": "pred_dh", "pm25_fakeDH2": "pred_dhFake2", 'pm25_fakeDH1' : 'pred_dhFake1', "pm25_fakeDH3" : 'pred_dhFake3'})

In [39]:
joined_df.to_csv('./Data/cali_example/training_data_2010.csv', index = False)

In [48]:
dh_2010 = dh_2010.rename(columns={'approx_Long' : 'lon', 'approx_Lat' : 'lat', 'pm25_dh' : 'pm25'})
fake_data_1 = fake_data_1.rename(columns={'approx_Long' : 'lon', 'approx_Lat' : 'lat', 'pm25_fakeDH1' : 'pm25'})
fake_data_2 = fake_data_2.rename(columns={'approx_Long' : 'lon', 'approx_Lat' : 'lat', 'pm25_fakeDH2' : 'pm25'})
fake_data_3 = fake_data_3.rename(columns={'approx_Long' : 'lon', 'approx_Lat' : 'lat', 'pm25_fakeDH3' : 'pm25'})

In [52]:
dh_2010.to_csv('./Data/cali_example/dh_2010_align.csv')
fake_data_1.to_csv('./Data/cali_example/dhFake1_2010_align.csv')
fake_data_2.to_csv('./Data/cali_example/dhFake2_2010_align.csv')
fake_data_3.to_csv('./Data/cali_example/dhFake3_2010_align.csv')