## Create Subset

In [84]:
import pandas as pd
import numpy as np
import os
import pickle as pk
from collections import defaultdict

In [85]:
# Training table: only column positions 1 through 6 of the CSV are read.
training_data = pd.read_csv(
    './Data/cali_example/training_data_2010.csv',
    usecols=list(range(1, 7)),
)

# AV / GM / GS aligned tables: column positions 1 through 3 of each file.
av, gm, gs = (
    pd.read_csv(csv_path, usecols=[1, 2, 3])
    for csv_path in (
        './Data/cali_example/AV_2010_align.csv',
        './Data/cali_example/GM_2010_align.csv',
        './Data/cali_example/GS_2010_align.csv',
    )
)

In [86]:
# (rows, columns) of the training table followed by the AV, GM and GS tables.
tuple(frame.shape for frame in (training_data, av, gm, gs))

((83, 6), (974544, 3), (974544, 3), (974544, 3))

In [87]:
# Keep only the training rows whose 'pred_GS' value is present.
training_ = training_data.loc[training_data['pred_GS'].notna()]

In [88]:
# Rebind training_data to the filtered frame; the unfiltered table is no
# longer reachable under this name after this cell runs.
training_data = training_

In [89]:
# (rows, columns) of the training table after the null filter.
training_data.shape

(80, 6)

In [90]:
# Number of missing values in each column of av.
av.isna().sum()

lat          0
lon          0
pm25    242929
dtype: int64

In [91]:
# Number of missing values in each column of gm.
gm.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [92]:
# Number of missing values in each column of gs.
gs.isna().sum()

lat          0
lon          0
pm25    249648
dtype: int64

In [93]:
# Preview the first rows of av.
av.head()

Unnamed: 0,lat,lon,pm25
0,42.005,-124.405,
1,42.005,-124.395,
2,42.005,-124.385,
3,42.005,-124.375,
4,42.005,-124.365,


### Getting all indices with no null predictions

In [94]:
# Rows of av and gs that have a 'pm25' value; the original row labels are kept.
av_ = av.loc[av['pm25'].notna()]
gs_ = gs.loc[gs['pm25'].notna()]

In [95]:
# (rows, columns) of the non-null AV and GS subsets.
tuple(frame.shape for frame in (av_, gs_))

((731615, 3), (724896, 3))

In [96]:
# Row labels that have a non-null 'pm25' in both AV and GS.
# A set has no guaranteed iteration order, so the labels are sorted: this makes
# the row order of every frame selected with .loc[indices] deterministic, which
# matters because later cells pair rows with prediction arrays by position.
indices = sorted(set(av_.index) & set(gs_.index))

In [97]:
# Number of row labels shared by the non-null AV and GS subsets.
len(indices)

723716

In [98]:
# Restrict all three tables to the shared row labels, in the order given by `indices`.
av, gm, gs = (frame.loc[indices] for frame in (av, gm, gs))

In [99]:
# (rows, columns) of av, gm and gs after selecting the shared row labels.
tuple(frame.shape for frame in (av, gm, gs))

((723716, 3), (723716, 3), (723716, 3))

In [100]:
# Preview the first rows of the filtered training table.
training_data.head()

Unnamed: 0,lon,lat,pm25_obs,pred_AV,pred_GS,pred_GM
0,-124.0839,41.5608,2.835678,3.0,5.364638,1.940895
1,-122.92229,39.0327,3.07541,3.7,4.792302,2.075552
3,-122.8046,40.7864,2.069659,3.6,5.17278,2.075552
4,-122.633579,41.726892,4.501818,3.8,6.890557,1.612972
5,-122.4031,40.6304,6.252542,6.6,4.76404,2.075552


In [None]:
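# Pasted min/max summary kept for reference (not Python code).
# x and y are presumably longitude and latitude -- TODO confirm.
#         min       max
# x -124.48200 -114.1312
# y   32.52883   42.0095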

In [17]:
# Longitude and latitude limits of the box used to cut out the subset.
min_lon, max_lon = -119.0, -116.0
min_lat, max_lat = 31.0, 34.0

In [18]:
# Keep the training rows inside the bounding box; both limits of each range are inclusive.
training_data = training_data.loc[
    training_data.lon.between(min_lon, max_lon)
    & training_data.lat.between(min_lat, max_lat)
]

In [20]:
# Keep the av rows inside the bounding box; both limits of each range are inclusive.
av = av.loc[
    av.lon.between(min_lon, max_lon)
    & av.lat.between(min_lat, max_lat)
]

In [21]:
# Keep the gm rows inside the bounding box; both limits of each range are inclusive.
gm = gm.loc[
    gm.lon.between(min_lon, max_lon)
    & gm.lat.between(min_lat, max_lat)
]

In [22]:
# Keep the gs rows inside the bounding box; both limits of each range are inclusive.
gs = gs.loc[
    gs.lon.between(min_lon, max_lon)
    & gs.lat.between(min_lat, max_lat)
]

In [101]:
# (rows, columns) of av, gm and gs at this point in the notebook.
tuple(frame.shape for frame in (av, gm, gs))

((723716, 3), (723716, 3), (723716, 3))

In [102]:
# Re-check the per-column missing-value counts of av before saving.
av.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [103]:
# Re-check the per-column missing-value counts of gm before saving.
gm.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [104]:
# Re-check the per-column missing-value counts of gs before saving.
gs.isna().sum()

lat     0
lon     0
pm25    0
dtype: int64

In [105]:
# The training table is written without its index column.
training_data.to_csv(
    './Cali_Example/example/data/training_data_2010.csv',
    index=False,
)

# The three prediction tables are written with their index (pandas default).
for _out_path, _frame in (
    ('./Cali_Example/example/data/AV_2010_align.csv', av),
    ('./Cali_Example/example/data/GM_2010_align.csv', gm),
    ('./Cali_Example/example/data/GS_2010_align.csv', gs),
):
    _frame.to_csv(_out_path)

## Create file to visualize BNE predictions

In [15]:
# Row count of av. num_coords is recomputed from the loaded segments further down.
num_coords = len(av)
# Presumably the MCMC sample count; it is not used by any visible cell -- TODO confirm.
num_mcmc = 5000

In [32]:
# Directory prefix under which the per-segment result pickles are read and the
# merged pickles are written.
_SAVE_ADDR_PREFIX = "./Cali_Example/result_ca_2010_subsegments/calibre_2d_annual_pm25_example_ca_2010"
# Sub-directory name inserted into every pickle path below the prefix.
family_name = 'hmc'

In [28]:
def Merge(dict1, dict2):
    """Return a new dict holding the entries of both inputs.

    Neither argument is modified. When a key occurs in both, the value from
    ``dict2`` is kept.
    """
    merged = {}
    for mapping in (dict1, dict2):
        merged.update(mapping)
    return merged

In [40]:
num_subsegs = 5
ensemble_mean_val, ensemble_uncn_val = [], []

# Read the mean and uncertainty pickle of every sub-segment, in segment order.
for seg in range(num_subsegs):
    print(seg)

    mean_path = os.path.join(
        _SAVE_ADDR_PREFIX,
        '{}/ensemble_mean_dict_{}.pkl'.format(family_name, seg))
    with open(mean_path, 'rb') as handle:
        ensemble_mean_val.append(pk.load(handle))

    uncn_path = os.path.join(
        _SAVE_ADDR_PREFIX,
        '{}/ensemble_uncn_dict_{}.pkl'.format(family_name, seg))
    with open(uncn_path, 'rb') as handle:
        ensemble_uncn_val.append(pk.load(handle))

0
1
2
3
4


In [53]:
# Empty placeholders; no visible cell reads them afterwards.
mean_dict = defaultdict()
unc_dict = defaultdict()

# Total number of rows across the 'overall' arrays of all loaded segments.
num_coords = sum(
    ensemble_mean_val[seg]['overall'].shape[0]
    for seg in range(num_subsegs)
)

In [64]:
# Result containers: every field starts as None and is filled by the
# concatenation cell further down.
post_mean_dict = dict.fromkeys(('overall', 'mean', 'resid'))
post_uncn_dict = dict.fromkeys(('overall', 'mean', 'resid', 'noise'))

In [67]:
# Print each key of post_mean_dict, one per line.
for key in post_mean_dict:
    print (key)

overall
mean
resid


In [68]:
for key in post_mean_dict:
    post_mean_dict[key] = np.concatenate([ensemble_mean_val[i][key] for i in range(num_subsegs)], axis = None).reshape(num_coords)

for key in post_uncn_dict:
    post_uncn_dict[key] = np.concatenate([ensemble_uncn_val[i][key] for i in range(num_subsegs)], axis = None).reshape(num_coords)

In [73]:
# np.save(_SAVE_ADDR_PREFIX + '/{}/ensemble_posterior_pred_dist_sample.npy'.format(family_name), sample_val)

In [74]:
# np.save(_SAVE_ADDR_PREFIX + '/{}/ensemble_posterior_pred_mean_sample.npy'.format(family_name), mean_val)

In [76]:
# Write the two merged dictionaries into the same directory the per-segment
# pickles were read from.
mean_out = os.path.join(_SAVE_ADDR_PREFIX,
                        '{}/ensemble_mean_dict.pkl'.format(family_name))
with open(mean_out, 'wb') as sink:
    pk.dump(post_mean_dict, sink, protocol=pk.HIGHEST_PROTOCOL)

uncn_out = os.path.join(_SAVE_ADDR_PREFIX,
                        '{}/ensemble_uncn_dict.pkl'.format(family_name))
with open(uncn_out, 'wb') as sink:
    pk.dump(post_uncn_dict, sink, protocol=pk.HIGHEST_PROTOCOL)

In [27]:
# First num_coords rows of av, without the 'pm25' column.
av_sub = av.iloc[:num_coords].drop(columns=['pm25'])

In [81]:
# Build the output table as a new frame instead of assigning columns onto a
# slice of av: writing into av[['lat', 'lon']] triggers SettingWithCopyWarning
# and leaves it unclear whether a view or a copy is modified.
# Each prediction array must contain exactly one value per row of av,
# otherwise pandas raises a ValueError on the length mismatch.
locations = av[['lat', 'lon']].assign(
    mean_overall=post_mean_dict['overall'],
    mean_mean=post_mean_dict['mean'],
)

In [83]:
# Write the locations table (lat, lon and the two mean columns) without its index.
locations.to_csv('./Data/cali_example/model_predictions_LA.csv', index = False)