## Create Subset

In [1]:
import pandas as pd
import numpy as np
import os
import pickle as pk

In [3]:
# Training table: after skipping CSV column 0 it holds lon, lat, pm25_obs and
# the pred_AV / pred_GS / pred_GM columns.
training_data = pd.read_csv(
    './Data/cali_example/training_data_2010.csv',
    usecols=[1, 2, 3, 4, 5, 6],
)

# Gridded AV / GM / GS files; CSV columns 1-3 of each are kept (column 0 is skipped).
av, gm, gs = [
    pd.read_csv(grid_path, usecols=[1, 2, 3])
    for grid_path in (
        './Data/cali_example/AV_2010_align.csv',
        './Data/cali_example/GM_2010_align.csv',
        './Data/cali_example/GS_2010_align.csv',
    )
]

In [5]:
# Size check of everything that was just loaded: (rows, columns) per table.
tuple(frame.shape for frame in (training_data, av, gm, gs))

((83, 6), (974544, 3), (974544, 3), (974544, 3))

In [6]:
# Preview the first rows of the training table.
training_data.head()

Unnamed: 0,lon,lat,pm25_obs,pred_AV,pred_GS,pred_GM
0,-124.0839,41.5608,2.835678,3.0,5.364638,1.940895
1,-122.92229,39.0327,3.07541,3.7,4.792302,2.075552
2,-122.9085,38.1224,5.13713,4.2,,3.891888
3,-122.8046,40.7864,2.069659,3.6,5.17278,2.075552
4,-122.633579,41.726892,4.501818,3.8,6.890557,1.612972


In [7]:
# Longitude / latitude limits of the subset region. The filter cells compare
# with >= and <=, so both ends are inclusive.
min_lon, max_lon = -120.0, -117.0
min_lat, max_lat = 31.0, 35.0

In [8]:
# Keep only training rows inside the bounding box (between() is inclusive on both ends).
in_lon_range = training_data['lon'].between(min_lon, max_lon)
in_lat_range = training_data['lat'].between(min_lat, max_lat)
training_data = training_data.loc[in_lon_range & in_lat_range]

In [10]:
# Keep only AV grid rows inside the bounding box (between() is inclusive on both ends).
in_lon_range = av['lon'].between(min_lon, max_lon)
in_lat_range = av['lat'].between(min_lat, max_lat)
av = av.loc[in_lon_range & in_lat_range]

In [11]:
# Keep only GM grid rows inside the bounding box (between() is inclusive on both ends).
in_lon_range = gm['lon'].between(min_lon, max_lon)
in_lat_range = gm['lat'].between(min_lat, max_lat)
gm = gm.loc[in_lon_range & in_lat_range]

In [12]:
# Keep only GS grid rows inside the bounding box (between() is inclusive on both ends).
in_lon_range = gs['lon'].between(min_lon, max_lon)
in_lat_range = gs['lat'].between(min_lat, max_lat)
gs = gs.loc[in_lon_range & in_lat_range]

In [13]:
# The three grid subsets should report the same (rows, columns) after filtering.
tuple(grid.shape for grid in (av, gm, gs))

((74100, 3), (74100, 3), (74100, 3))

In [21]:
# Count missing values per column in the AV subset.
av.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [15]:
# Count missing values per column in the GM subset.
gm.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [20]:
# Count missing values per column in the GS subset.
gs.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [19]:
# Back-fill missing values in the AV and GS subsets: each NaN takes the next
# valid value further down its column. DataFrame.bfill() is the direct
# equivalent of fillna(method='bfill'), whose `method` keyword is deprecated
# in recent pandas releases. gm is not filled in this cell.
av = av.bfill()
gs = gs.bfill()

In [22]:
# Training subset is written without the index column.
training_data.to_csv('./Cali_Example/example/data/training_data_2010.csv', index=False)

# The grid subsets keep the default index column: the later read of
# AV_2010_align.csv uses usecols=[1, 2, 3], which skips column 0.
for grid_frame, grid_path in (
    (av, './Cali_Example/example/data/AV_2010_align.csv'),
    (gm, './Cali_Example/example/data/GM_2010_align.csv'),
    (gs, './Cali_Example/example/data/GS_2010_align.csv'),
):
    grid_frame.to_csv(grid_path)

## Create file to visualize BNE predictions

In [78]:
# Number of rows in the AV subset; used as the row count when the stacked
# posterior arrays are reshaped.
num_coords = len(av)
# Column count used for the reshaped posterior arrays.
# Presumably the number of MCMC draws kept per location — confirm against the sampler config.
num_mcmc = 5000

In [85]:
# Sub-directory under the result root whose pickles are read and written.
family_name = 'hmc'
# Root folder of the result files.
_SAVE_ADDR_PREFIX = "./Cali_Example/result_ca_2010_subsegments/calibre_2d_annual_pm25_example_ca_2010"

In [41]:
# Result files are numbered 0..N_SUBSEGMENTS-1, one set of pickles per sub-segment.
N_SUBSEGMENTS = 5


def _load_segment_pickle(kind, segment):
    """Load one per-segment result pickle for the current ``family_name``.

    Args:
        kind: Middle part of the file name ('pred_mean', 'pred_dist' or 'lat_lon').
        segment: Integer index of the sub-segment file.

    Returns:
        Whatever object was pickled in
        ``<prefix>/<family_name>/ensemble_posterior_<kind>_sample_<segment>.pkl``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    path = os.path.join(
        _SAVE_ADDR_PREFIX,
        '{}/ensemble_posterior_{}_sample_{}.pkl'.format(family_name, kind, segment))
    # NOTE(security): pickle.load can execute arbitrary code. Only load result
    # files produced by this project, never files from an untrusted source.
    with open(path, 'rb') as file:
        return pk.load(file)


ensemble_lat_lon = []
ensemble_mean_val = []
ensemble_sample_val = []

# Same read order as before: mean, dist, lat_lon for each segment in turn.
for i in range(N_SUBSEGMENTS):
    ensemble_mean_val.append(_load_segment_pickle('pred_mean', i))
    ensemble_sample_val.append(_load_segment_pickle('pred_dist', i))
    ensemble_lat_lon.append(_load_segment_pickle('lat_lon', i))

In [77]:
# Shape of the first sub-segment's posterior sample array.
ensemble_sample_val[0].shape

(14820, 5000)

In [79]:
# np.stack adds a leading sub-segment axis; the reshape then folds that axis
# into the rows, so the segments end up one after another in load order.
lat_lon = np.reshape(np.stack(ensemble_lat_lon), (num_coords, 2))
sample_val = np.reshape(np.stack(ensemble_sample_val), (num_coords, num_mcmc))
# assumes the mean pickles have the same per-segment shape as the sample pickles — TODO confirm
mean_val = np.reshape(np.stack(ensemble_mean_val), (num_coords, num_mcmc))

In [86]:
# Write the combined (all sub-segments) arrays next to the per-segment files.
for name_template, payload in (
    ('{}/ensemble_posterior_pred_dist_sample.pkl', sample_val),
    ('{}/ensemble_posterior_pred_mean_sample.pkl', mean_val),
):
    out_path = os.path.join(_SAVE_ADDR_PREFIX, name_template.format(family_name))
    with open(out_path, 'wb') as file:
        pk.dump(payload, file, protocol=pk.HIGHEST_PROTOCOL)

In [80]:
# Shape of the combined sample array; expected to be (num_coords, num_mcmc) after the reshape.
sample_val.shape

(74100, 5000)

In [82]:
# Row-wise averages (over axis 1) of the combined arrays:
#   overall - mean of sample_val
#   mean    - mean of mean_val
#   resid   - mean of (sample_val - mean_val)
post_mean_dict = dict(
    overall=sample_val.mean(axis=1),
    mean=mean_val.mean(axis=1),
    resid=(sample_val - mean_val).mean(axis=1),
)

In [83]:
# Peek at the first ten coordinate rows loaded from the lat_lon pickles.
lat_lon[:10]

array([[-0.5       ,  0.49993333],
       [-0.4966548 ,  0.49993333],
       [-0.49330962,  0.49993333],
       [-0.48996443,  0.49993333],
       [-0.4866218 ,  0.49993333],
       [-0.4832766 ,  0.49993333],
       [-0.4799314 ,  0.49993333],
       [-0.4765888 ,  0.49993333],
       [-0.4732436 ,  0.49993333],
       [-0.4698984 ,  0.49993333]], dtype=float32)

In [None]:
# Shape of the row-wise mean of sample_val (one value per row).
post_mean_dict['overall'].shape

In [None]:
# Re-read the AV subset written earlier in this notebook. That file was saved
# with its index as column 0, so usecols=[1, 2, 3] skips the index column.
locations = pd.read_csv('./Cali_Example/example/data/AV_2010_align.csv', usecols = [1,2,3])

In [None]:
# Row count should match the length of the post_mean_dict arrays assigned in the next cell.
locations.shape

In [None]:
# Attach the posterior summaries as new columns, matched to rows by position.
# NOTE(review): assumes the rows of the AV subset are in the same order as the
# concatenated sub-segment predictions — confirm against lat_lon.
locations = locations.assign(
    mean_overall=post_mean_dict['overall'],
    mean_mean=post_mean_dict['mean'],
)

In [None]:
# Save the locations table, including the two added prediction columns, without the index column.
locations.to_csv('./Data/cali_example/model_predictions.csv', index = False)