## Create Subset

In [1]:
import pandas as pd
import numpy as np
import os
import pickle as pk

In [33]:
# Monitor/training table: keep columns 1-6 (column 0 is skipped).
training_data = pd.read_csv(
    './Data/cali_example/training_data_2010.csv',
    usecols=[1, 2, 3, 4, 5, 6],
)

# The three aligned prediction grids share the same column selection.
grid_cols = [1, 2, 3]
av, gm, gs = (
    pd.read_csv(csv_path, usecols=grid_cols)
    for csv_path in (
        './Data/cali_example/AV_2010_align.csv',
        './Data/cali_example/GM_2010_align.csv',
        './Data/cali_example/GS_2010_align.csv',
    )
)

In [3]:
# Sanity check: (rows, columns) of each frame right after loading.
training_data.shape, av.shape, gm.shape, gs.shape

((83, 6), (974544, 3), (974544, 3), (974544, 3))

In [4]:
# Keep only the training rows that have a non-null 'pred_GS' value.
has_gs_pred = training_data['pred_GS'].notnull()
training_ = training_data[has_gs_pred]

In [5]:
# Replace the working training table with the filtered copy.
training_data = training_

In [6]:
# Row/column count of the training table after dropping null 'pred_GS' rows.
training_data.shape

(80, 6)

In [None]:
# Number of missing values in each column of av.
av.isna().sum()

In [None]:
# Number of missing values in each column of gm.
gm.isna().sum()

In [None]:
# Number of missing values in each column of gs.
gs.isna().sum()

In [None]:
# Preview the first rows of av.
av.head()

### Getting all indices with no null predictions

In [7]:
# Rows of av / gs whose 'pm25' value is present.
# (gm is not filtered here; only av and gs define the usable rows.)
av_ = av[av['pm25'].notnull()]
gs_ = gs[gs['pm25'].notnull()]

In [8]:
# Row/column counts of the non-null subsets of av and gs.
av_.shape, gs_.shape

((731615, 3), (724896, 3))

In [9]:
# Row labels that have a non-null 'pm25' in BOTH av and gs.
# Index.intersection is used instead of list(set(...) & set(...)): a Python
# set has no guaranteed iteration order, so the old form could hand .loc an
# arbitrarily ordered label list and shuffle the rows of av/gm/gs. The result
# still supports len() and .loc[...] exactly like the previous list.
indices = av_.index.intersection(gs_.index)

In [10]:
# Number of row labels shared by the non-null av and gs subsets.
len(indices)

723716

In [11]:
# Apply the same row selection to all three grids so they stay row-aligned.
av, gm, gs = (frame.loc[indices] for frame in (av, gm, gs))

In [12]:
# Check that the three grids have identical shapes after the row selection.
av.shape, gm.shape, gs.shape

((723716, 3), (723716, 3), (723716, 3))

In [None]:
# Preview the first rows of the training table.
training_data.head()

In [None]:
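# Coordinate bounds (x = longitude, y = latitude) used for the
# min_lon / max_lon / min_lat / max_lat constants:
#          min       max
# x -124.48200 -114.1312
# y   32.52883   42.0095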

In [32]:
# Bounding box used to filter every table: inclusive longitude and latitude
# limits.
min_lon, max_lon = -124.48200, -114.1312
min_lat, max_lat = 32.52883, 42.0095

In [34]:
# Keep training rows inside the bounding box (both ends inclusive).
lon_in_box = training_data['lon'].between(min_lon, max_lon)
lat_in_box = training_data['lat'].between(min_lat, max_lat)
training_data = training_data.loc[lon_in_box & lat_in_box]

In [36]:
# Keep av rows inside the bounding box (both ends inclusive).
lon_in_box = av['lon'].between(min_lon, max_lon)
lat_in_box = av['lat'].between(min_lat, max_lat)
av = av.loc[lon_in_box & lat_in_box]

In [37]:
# Keep gm rows inside the bounding box (both ends inclusive).
lon_in_box = gm['lon'].between(min_lon, max_lon)
lat_in_box = gm['lat'].between(min_lat, max_lat)
gm = gm.loc[lon_in_box & lat_in_box]

In [38]:
# Keep gs rows inside the bounding box (both ends inclusive).
lon_in_box = gs['lon'].between(min_lon, max_lon)
lat_in_box = gs['lat'].between(min_lat, max_lat)
gs = gs.loc[lon_in_box & lat_in_box]

In [39]:
# Check that the three grids still share one shape after the bounding-box filter.
av.shape, gm.shape, gs.shape

((974544, 3), (974544, 3), (974544, 3))

In [None]:
# Missing values per column of av after the bounding-box filter.
av.isna().sum()

In [None]:
# Missing values per column of gm after the bounding-box filter.
gm.isna().sum()

In [None]:
# Missing values per column of gs after the bounding-box filter.
gs.isna().sum()

In [None]:
# Fill each missing value with the next valid value further down the same
# column. DataFrame.bfill() replaces fillna(method='bfill'), whose `method`
# argument is deprecated in pandas 2.1+; the result is the same.
# Note: any NaNs after the last valid value in a column are left in place.
av = av.bfill()
gs = gs.bfill()

In [None]:
# Training table is written without its index column.
training_data.to_csv('./Cali_Example/example/data/training_data_2010.csv', index=False)

# The grids are written WITH the index as the first column; the later
# read of AV_2010_align.csv uses usecols=[1, 2, 3], which skips that column.
for frame, out_path in (
    (av, './Cali_Example/example/data/AV_2010_align.csv'),
    (gm, './Cali_Example/example/data/GM_2010_align.csv'),
    (gs, './Cali_Example/example/data/GS_2010_align.csv'),
):
    frame.to_csv(out_path)

## Create file to visualize BNE predictions

In [15]:
# Draws per location (used as the second reshape dimension further down).
num_mcmc = 5000
# One coordinate per row of av.
num_coords = len(av)

In [14]:
_SAVE_ADDR_PREFIX = "./Cali_Example/result_ca_2010_allsubsegments/calibre_2d_annual_pm25_example_ca_2010"
family_name = 'hmc'

In [16]:
# Number of sub-segment result files to load.
num_subsegs = 5

# One array per sub-segment, appended in sub-segment order.
ensemble_mean_val = []
ensemble_sample_val = []

# (relative path template, destination list); templates take
# (family_name, sub-segment index).
pickle_targets = (
    ('{}/ensemble_posterior_pred_mean_sample_{}.pkl', ensemble_mean_val),
    ('{}/ensemble_posterior_pred_dist_sample_{}.pkl', ensemble_sample_val),
)

for seg_idx in range(num_subsegs):
    print (seg_idx)
    for path_template, bucket in pickle_targets:
        pkl_path = os.path.join(_SAVE_ADDR_PREFIX,
                                path_template.format(family_name, seg_idx))
        # NOTE: unpickling can execute arbitrary code; only load result
        # files produced by a trusted run.
        with open(pkl_path, 'rb') as file:
            bucket.append(pk.load(file))

0
1
2
3
4


In [17]:
# Shape of the first sub-segment's sample array.
ensemble_sample_val[0].shape

(24124, 5000)

In [18]:
# Total coordinates covered by the loaded results; this assumes every
# sub-segment has as many rows as the first one.
rows_per_subseg = ensemble_sample_val[0].shape[0]
num_coords = rows_per_subseg * num_subsegs

In [20]:
# Stack the per-sub-segment arrays along a new leading axis, then flatten
# that axis into the row dimension: result is (num_coords, num_mcmc).
flat_shape = (num_coords, num_mcmc)
sample_val = np.reshape(np.stack(ensemble_sample_val, axis=0), flat_shape)
mean_val = np.reshape(np.stack(ensemble_mean_val, axis=0), flat_shape)

In [21]:
# Both combined arrays should have the same (num_coords, num_mcmc) shape.
sample_val.shape, mean_val.shape

((120620, 5000), (120620, 5000))

In [22]:
import gc

In [23]:
# Drop the frames that the remaining cells shown here no longer read, to
# lower memory use before the large arrays are saved and aggregated.
# The original cell ran `del gc`, which only unbound the gc module imported
# in the previous cell; `gs` looks like the intended target.
del training_data
del gm
del gs
# Ask the garbage collector to reclaim the released frames right away.
gc.collect()

In [None]:
# Save the combined sample array as .npy inside the family sub-directory.
dist_npy_path = _SAVE_ADDR_PREFIX + '/{}/ensemble_posterior_pred_dist_sample.npy'.format(family_name)
np.save(dist_npy_path, sample_val)

In [None]:
# Save the combined mean array as .npy inside the family sub-directory.
mean_npy_path = _SAVE_ADDR_PREFIX + '/{}/ensemble_posterior_pred_mean_sample.npy'.format(family_name)
np.save(mean_npy_path, mean_val)

In [24]:
# Pickle the combined arrays next to the per-sub-segment files, using the
# highest available pickle protocol.
for rel_template, payload in (
    ('{}/ensemble_posterior_pred_dist_sample.pkl', sample_val),
    ('{}/ensemble_posterior_pred_mean_sample.pkl', mean_val),
):
    out_path = os.path.join(_SAVE_ADDR_PREFIX, rel_template.format(family_name))
    with open(out_path, 'wb') as file:
        pk.dump(payload, file, protocol=pk.HIGHEST_PROTOCOL)

In [None]:
# Per-coordinate posterior means: each entry averages over axis 1 (the
# num_mcmc axis of the (num_coords, num_mcmc) arrays).
#   "overall": mean of the full predictive samples
#   "mean":    mean of the mean-component samples
#   "resid":   mean of their difference
# Fix: the comma after the "mean" entry was missing, which made this cell a
# SyntaxError.
post_mean_dict = {
    "overall": np.mean(sample_val, axis=1),
    "mean": np.mean(mean_val, axis=1),
    "resid": np.mean(sample_val - mean_val, axis=1),
}

In [None]:
# Shape of the per-coordinate overall posterior mean.
post_mean_dict['overall'].shape

In [27]:
# First num_coords rows of av with the 'pm25' column removed, leaving the
# remaining columns as the frame that receives the model summaries.
av_sub = av.iloc[:num_coords].drop(columns=['pm25'])

In [29]:
# Attach the row-wise (axis 1) averages of the combined arrays as new columns.
for column_name, draws in (
    ('mean_overall', sample_val),
    ('mean_mean', mean_val),
):
    av_sub[column_name] = np.mean(draws, axis=1)

In [31]:
# Write the coordinates plus the two mean columns, without the index column.
av_sub.to_csv('./Data/cali_example/model_predictions_sub.csv', index = False)

In [None]:
# Re-read the AV subset written earlier in this notebook. That file was saved
# with its index as column 0, so usecols=[1, 2, 3] keeps only the data columns.
locations = pd.read_csv('./Cali_Example/example/data/AV_2010_align.csv', usecols = [1,2,3])

In [None]:
# Row count here must equal the length of the post_mean_dict arrays assigned below.
locations.shape

In [None]:
# Copy the per-coordinate posterior means into the locations table.
for column_name, dict_key in (
    ('mean_overall', 'overall'),
    ('mean_mean', 'mean'),
):
    locations[column_name] = post_mean_dict[dict_key]

In [None]:
# Preview the locations table with the added mean columns.
locations.head()

In [None]:
# Write the locations table with the added mean columns, without the index column.
locations.to_csv('./Data/cali_example/model_predictions_LA.csv', index = False)