# Predicting Ground Water Levels with Kernel Regression

In [1]:
from __future__ import absolute_import, division, print_function

import os
import json
import pyro
import torch
import pickle
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import pyro.optim as optim
import pyro.contrib.gp as gp
import matplotlib.pyplot as plt
import pyro.distributions as dist
import matplotlib.animation as animation

from torch.distributions import constraints

from functools import partial
from pyro.infer.mcmc import NUTS
from pyro.infer.mcmc.api import MCMC
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import Image, Video
from pyro.contrib.autoguide import AutoMultivariateNormal
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO, JitTrace_ELBO

pyro.set_rng_seed(0)

In [2]:
%matplotlib inline
logging.basicConfig(format="%(message)s", level=logging.INFO)

# Enable validation checks
pyro.enable_validation(True)
smoke_test = "CI" in os.environ
assert pyro.__version__.startswith("0.4.1")

In [3]:
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

## Helper Functions

In [4]:
def pairwise_distances(x, y=None):
    x_norm = (x**2).sum(1).view(-1, 1)
    if y is not None:
        y_t = torch.transpose(y, 0, 1)
        y_norm = (y**2).sum(1).view(1, -1)
    else:
        y_t = torch.transpose(x, 0, 1)
        y_norm = x_norm.view(1, -1)
    
    dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t)
    dist = torch.clamp(dist, 0.0, np.inf)
    
    return dist

In [5]:
def summary(samples):
    site_stats = {}
    for site_name, values in samples.items():
        marginal_site = pd.DataFrame(values)
        describe = marginal_site.describe(percentiles=[.05, 0.25, 0.5, 0.75, 0.95]).transpose()
        site_stats[site_name] = describe[["mean", "std", "5%", "25%", "50%", "75%", "95%"]]
    return site_stats

In [6]:
def visualize_posterior(samples):
    import math
    
    sites = list(samples.keys())
    
    r = int(math.ceil(math.sqrt(len(samples))))
    fig, axs = plt.subplots(nrows=r, ncols=r, figsize=(15, 13))
    fig.suptitle("Marginal Posterior Density", fontsize=16)
    
    
    for i, ax in enumerate(axs.reshape(-1)):
        if i >= len(sites):
            break
        site = sites[i]
        sns.distplot(samples[site], ax=ax)
        ax.set_title(site)
        
    handles, labels = ax.get_legend_handles_labels()

## Defining the Model

### Generative Model
---
**Farm Factor**
\begin{align*}
    \ln(\delta) \sim \mathcal{N}(1.0, 0.5)
\end{align*}

**Distance Factors**
\begin{align*}
    \ln(\theta_w) \sim \mathcal{N}(0.0, 0.5) \\
    \ln(\theta_f) \sim \mathcal{N}(0.0, 0.5)
\end{align*}

**Variance**
\begin{align*}
    \sigma^2 \sim \text{Gam}(1.0, 1.0)
\end{align*}

**Seasonal Factors**
For season $s \in \mathcal{S}$
\begin{align*}
    \gamma_s \sim \mathcal{N}(0.0, 1.0)
\end{align*}

**Base Water Levels**

The base water levels are modeled as a simple AR(1) process. The details of this are as follows

\begin{align*}
    \mu_0 \sim \mathcal{N}(\gamma_{s_0}, 1.0) \\
\end{align*}
For $t = 1 \dots T$, we specify
\begin{align*}
    \mu_{t} \sim \mathcal{N}(\mu_{t - 1} + \gamma_{s_t}, 1.0)
\end{align*}

**Likelihood**

For $t = 0 \dots T$, we specify
\begin{align*}
    \mathbf{y}_t \sim \mathcal{N}(\mu_t - \delta \cdot K(X_{t,w}, X_{t,f})\ /\ \theta_f, 1.0)
\end{align*}

---

<img src="includes/hmm-model.png" alt="drawing" width="600"/>

In [7]:
n_seasons=4

In [8]:
def model1(XW, YW, YF, WF_distances, n_seasons=4, seasons=None, gp=False):
    assert not torch._C._get_tracing_state()

    delta = pyro.sample("delta", dist.LogNormal(1.0, 0.5))

    if gp:
        theta_w = pyro.sample("theta_w", dist.LogNormal(0.0, 0.5))    
    else:
        sigma = pyro.sample("sigma", dist.Gamma(1.0, 1.0))
    
    theta_f = pyro.sample("theta_f", dist.LogNormal(0.0, 0.5))
    
    sf = pyro.sample("sf", dist.Normal(torch.zeros(n_seasons), 1.0))
    if seasons is None:
        seasons = np.tile(np.arange(n_seasons), int(len(YW) / n_seasons + 1))[:len(YW)]

    mu = 0
    for t in pyro.markov(range(len(YW))):
        if gp:
            sigma = torch.exp(-pairwise_distances(XW[t], XW[t]) / theta_w)
                
        mu = pyro.sample(
            "mu_{}".format(t), dist.Normal(mu + sf[seasons[t]], 1.0)
        )
        
        mean = mu - delta * torch.mm(torch.exp(-WF_distances[t] / theta_f), YF[t])
        
        if gp:
            pyro.sample(
                "obs_{}".format(t), dist.MultivariateNormal(mean, sigma), obs=YW[t]
            )
        else:
            with pyro.plate("data_{}".format(t), len(YW[t])):
                pyro.sample(
                    "obs_{}".format(t), dist.Normal(mean, sigma), obs=YW[t]
                )

In [9]:
def predict1(XW, XF, YF, samples, n_seasons=4, seasons=None, gp=False, recon=False):
    sigma = samples["sigma"]
    delta = samples["delta"]
    
    if gp:
        theta_w = samples["theta_w"]
        
    theta_f = samples["theta_f"]
    
    sf = samples["sf"]
    if seasons is None:
        seasons = np.tile(np.arange(n_seasons), int(len(YW) / n_seasons + 1))[:len(YW)]
    
    mu = list(zip(*[samples["mu_{}".format(i)] for i in range(len(YF))]))
    mu = np.array(mu)
    
    samples = []
    for t in range(len(YF)):
        YF_ = YF[t].cpu().numpy()
        
        if gp:
            pdx = pairwise_distances(XW[t]).cpu().numpy()
        pdf = pairwise_distances(XW[t], XF[t]).cpu().numpy()
    
        samples_ = []
        for i in range(len(delta)):
            if gp:
                sg = np.exp(-pdx / theta_w[i])
            else:
                sg = sigma[i]
                
            mean = mu[i, t] - delta[i] * np.matmul(np.exp(-pdf / theta_f[i]), YF_) + sf[i][seasons[t]]
            if recon:
                samples_.append(mean)
            else:
                samples_.append(np.random.normal(mean, sg))
            
        samples_ = np.array(samples_)
        samples.append(samples_)
        
    return samples

## Working with Sample Data

In [10]:
data = pd.read_csv("data/sample-data/data.csv", encoding="ISO-8859-1")

data_wells = data[data.type == "well"]
data_farms = data[data.type == "farm"]

XW, YW = [], []
for t in data_wells["timestep"].unique():
    data_ = data_wells[data_wells["timestep"] == t]

    XW.append(data_[["latitude", "longitude"]].values)
    YW.append(data_["observation"].values)
    
XW = XW[0]

XF = data_farms[["latitude", "longitude"]].values
YF = data_farms["observation"].values

In [11]:
plt.clf()
fig = plt.figure(figsize=(10, 10), dpi=100)

plt.ion()

plt.scatter(XF[:, 0], XF[:, 1], marker="s", s=7, color="lightgreen")

scat = plt.scatter(XW[:, 0], XW[:, 1], marker="s", s=20, c=[(0, 0, 0, 1)] * len(XW))
label = plt.text(0, 0, '', fontsize=12)

colors = []
for obs in YW:
    colors.append([min(1 - abs(x) / 15, 1) for x in obs])
    
colors = np.array(colors)

def update_plot(i, scat):
    scat.set_array(colors[i])
    label.set_text(["Sp", "Su", "Fa", "Wi"][i % 4])
    return scat,

anim = animation.FuncAnimation(fig, update_plot, frames=range(len(XW)), fargs=(scat,), interval=1000)

plt.gray()
plt.close()

<Figure size 432x288 with 0 Axes>

In [12]:
anim.save("includes/sample-data-animation.mp4", fps=1)

Animation.save using <class 'matplotlib.animation.FFMpegWriter'>
MovieWriter.run: running command: ['ffmpeg', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-s', '1000x1000', '-pix_fmt', 'rgba', '-r', '1', '-loglevel', 'error', '-i', 'pipe:', '-vcodec', 'h264', '-pix_fmt', 'yuv420p', '-y', 'includes/sample-data-animation.mp4']


In [13]:
Video("includes/sample-data-animation.mp4")

In [14]:
XW = torch.tensor(XW)
YW = torch.tensor(YW)[..., None]

XF = torch.tensor(XF)
YF = torch.tensor(YF)[..., None]

In [15]:
timesteps = len(YW)

XW = XW.repeat(timesteps, 1, 1)

YF = YF.repeat(timesteps, 1, 1)
XF = XF.repeat(timesteps, 1, 1)

### Inference

In [16]:
pyro.set_rng_seed(1)

In [17]:
train = True
use_gp = False
save_samples = True

samples_file = "data/sample-data/" + ("gp-samples" if use_gp else "kr-samples") + ".json"

In [18]:
try:
    with open(samples_file, "r") as f:
        samples = {k: np.array(v) for k, v in json.load(f).items()}
except:
    print("Failed to load samples file")

In [19]:
if train:
    WF_distances = [pairwise_distances(XW[i], XF[i]) for i in range(len(YW))]

    nuts_kernel = NUTS(partial(model1, n_seasons=4, WF_distances=WF_distances, gp=use_gp))

    mcmc = MCMC(nuts_kernel, num_samples=100, warmup_steps=400)
    mcmc_run = mcmc.run(XW, YW, YF)

    samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

sample: 100%|██████████| 500/500 [1:04:39<00:00,  7.76s/it, step size=7.93e-02, acc. prob=0.969]


In [20]:
if save_samples:
    samples_ = {k: v.tolist() for k, v in samples.items()}
    with open(samples_file, "w") as f:
        json.dump(samples_, f)

In [21]:
for site, values in summary(samples).items():
    print("Site: {}".format(site))
    print(values, "\n")

Site: delta
       mean       std        5%       25%       50%       75%       95%
0  0.692136  0.000927  0.690626  0.691565  0.692088  0.692621  0.693916 

Site: sigma
       mean       std        5%      25%      50%       75%       95%
0  0.477168  0.003306  0.471929  0.47485  0.47708  0.479625  0.481943 

Site: theta_f
       mean       std       5%       25%       50%       75%       95%
0  0.598855  0.001004  0.59723  0.598261  0.598859  0.599406  0.600555 

Site: sf
       mean       std        5%       25%       50%       75%       95%
0 -1.300341  0.341198 -1.871682 -1.511644 -1.271030 -1.050845 -0.800381
1 -1.778845  0.268337 -2.252004 -1.916970 -1.799101 -1.639649 -1.328696
2  2.628817  0.343962  2.018205  2.414313  2.612782  2.824041  3.197797
3  0.017245  0.343971 -0.476871 -0.243713  0.013307  0.247328  0.583605 

Site: mu_0
       mean      std        5%       25%       50%       75%       95%
0 -5.282558  0.03037 -5.332829 -5.300073 -5.287637 -5.264877 -5.235152 

Site

## Working with Real Data

In [22]:
with open("data/dataset.pkl", "rb") as f:
    XF_r = [np.array(x) for x in pickle.load(f)]
    YF_r = [np.array(x) for x in pickle.load(f)]
                        
    XW_r = [np.array(x) for x in pickle.load(f)]
    YW_r = [np.array(x) for x in pickle.load(f)]
    
    XS_r = pickle.load(f)

In [23]:
plt.clf()
fig = plt.figure(figsize=(10, 10), dpi=100)

plt.ion()

scat_f = plt.scatter(XF_r[0][:, 0], XF_r[0][:, 1], marker="s", s=7, color="lightgreen")

scat_w = plt.scatter(XW_r[0][:, 0], XW_r[0][:, 1], marker="s", s=20, c=[(0, 0, 0, 1)] * len(XW_r[0]))
label = plt.text(0, 0, '', fontsize=12)

def update_plot(i, scat_w, scat_f):
    scat_w.set_offsets(XW_r[i])
    scat_w.set_array(np.array([min(1 - abs(x[0]) / 50, 1) for x in YW_r[i]]))
    
    scat_f.set_offsets(XF_r[i])
    return scat_w, scat_f

anim = animation.FuncAnimation(fig, update_plot, frames=range(len(XW_r)), fargs=(scat_w, scat_f), interval=1000)

plt.gray()
plt.close()

<Figure size 432x288 with 0 Axes>

In [24]:
anim.save("includes/data-animation.mp4", fps=1)

Animation.save using <class 'matplotlib.animation.FFMpegWriter'>
MovieWriter.run: running command: ['ffmpeg', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-s', '1000x1000', '-pix_fmt', 'rgba', '-r', '1', '-loglevel', 'error', '-i', 'pipe:', '-vcodec', 'h264', '-pix_fmt', 'yuv420p', '-y', 'includes/data-animation.mp4']


In [25]:
Video("includes/data-animation.mp4")

In [26]:
XF_r = [torch.tensor(x) for x in XF_r]
YF_r = [torch.tensor(x) for x in YF_r]

XW_r = [torch.tensor(x) for x in XW_r]
YW_r = [torch.tensor(x) for x in YW_r]

XS_r = torch.tensor(XS_r) - 1

In [27]:
Xs = torch.cat(XW_r + XF_r)

In [28]:
x_mean = Xs.mean(0)
x_std = Xs.std(0)

In [29]:
XW_r = [(x - x_mean) / x_std for x in XW_r]
XF_r = [(x - x_mean) / x_std for x in XF_r]

### Inference

In [30]:
pyro.set_rng_seed(1)

In [31]:
train = True
use_gp = False
save_samples = True

samples_file = "data/real-data/" + ("gp-samples" if use_gp else "kr-samples") + ".json"

In [32]:
try:
    with open(samples_file, "r") as f:
        samples = {k: np.array(v) for k, v in json.load(f).items()}
except:
    print("Failed to load samples file")

In [33]:
if train:
    WF_distances = [pairwise_distances(XW_r[i], XF_r[i]) for i in range(len(YW_r))]

    nuts_kernel = NUTS(partial(model1, n_seasons=3, seasons=XS_r, WF_distances=WF_distances, gp=use_gp))

    mcmc = MCMC(nuts_kernel, num_samples=100, warmup_steps=400)
    mcmc_run = mcmc.run(XW_r, YW_r, YF_r)

    samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

sample: 100%|██████████| 500/500 [22:46<00:00,  2.73s/it, step size=1.89e-01, acc. prob=0.921]  


In [34]:
if save_samples:
    samples_ = {k: v.tolist() for k, v in samples.items()}
    with open(samples_file, "w") as f:
        json.dump(samples_, f)

In [35]:
for site, values in summary(samples).items():
    print("Site: {}".format(site))
    print(values, "\n")

Site: delta
       mean       std        5%       25%       50%       75%       95%
0  0.000082  0.000008  0.000071  0.000076  0.000081  0.000087  0.000094 

Site: sigma
       mean       std         5%        25%        50%       75%        95%
0  22.62882  0.119883  22.456537  22.535894  22.607881  22.71427  22.830021 

Site: theta_f
       mean       std        5%       25%       50%       75%       95%
0  0.049288  0.004775  0.041594  0.046395  0.049497  0.052175  0.057354 

Site: sf
       mean       std        5%       25%       50%       75%       95%
0 -5.227908  0.413404 -5.923225 -5.496590 -5.253325 -4.937624 -4.516511
1 -1.309093  0.378630 -1.879215 -1.572286 -1.290800 -1.030404 -0.749622
2  3.737078  0.476756  2.915486  3.464752  3.733816  4.017878  4.546121 

Site: mu_0
        mean     std         5%        25%       50%        75%        95%
0 -22.033148  0.5547 -22.950399 -22.388257 -22.05194 -21.664929 -21.143717 

Site: mu_1
        mean       std         5%        25

## Alternate Model

### Defining the Model

In [36]:
def model2(XW, YW, YF, WF_distances, n_seasons=3, seasons=None):
    assert not torch._C._get_tracing_state()

    delta_c = pyro.sample("delta_c", dist.LogNormal(1.0, 0.5))
    delta_p = pyro.sample("delta_p", dist.LogNormal(1.0, 0.5))

    sigma = pyro.sample("sigma", dist.Gamma(1.0, 1.0))
    theta_f = pyro.sample("theta_f", dist.LogNormal(1.0, 0.5))

    sf = pyro.sample("sf", dist.Normal(torch.zeros(n_seasons), 1.0))
    if seasons is None:
        seasons = np.tile(np.arange(n_seasons), int(len(YW) / n_seasons + 1))[:len(YW)]

    y = YW[0]
    ff = torch.mm(torch.exp(-WF_distances[0] / theta_f), YF[0])

    for t in pyro.markov(range(1, len(YW))):
        with pyro.plate("data_{}".format(t), len(YW[t])):
            mean = y - delta_p * ff + sf[seasons[t]]
            ff = torch.mm(torch.exp(-WF_distances[t] / theta_f), YF[t])

            mean -= delta_c * ff

            y = pyro.sample("obs_{}".format(t), dist.Normal(mean, sigma), obs=YW[t])

In [37]:
def predict2(XW, XF, YF, samples, n_seasons=3, seasons=None, recon=False):
    sigma = samples["sigma"]
    delta_c = samples["delta_c"]
    delta_p = samples["delta_p"]

    theta_f = samples["theta_f"]

    sf = samples["sf"]
    if seasons is None:
        seasons = np.tile(np.arange(n_seasons), int(len(YW) / n_seasons + 1))[:len(YW)]
    
    YF = [x.cpu().numpy() for x in YF]

    samples = []
    samples_ = [0] * len(sigma)
    for t in range(1, len(YF)):
        pdf_c = pairwise_distances(XW[t], XF[t]).cpu().numpy()
        pdf_p = pairwise_distances(XW[t - 1], XF[t - 1]).cpu().numpy()

        for i in range(len(sigma)):
            sg = sigma[i]
            mean = (
                samples_[i]
                - delta_c[i] * np.matmul(np.exp(-pdf_c / theta_f[i]), YF[t])
                - delta_p[i] * np.matmul(np.exp(-pdf_p / theta_f[i]), YF[t-1])
                + sf[i][seasons[t]]
            )

            if recon:
                samples_[i] = mean
            else:
                samples_[i] = np.random.normal(mean, sg)

        samples_ = np.array(samples_)
        samples.append(samples_)

    return samples

### Preparing the Data

In [38]:
intersection = XW_r[0]

for arr in XW_r[1:]:
    indices = torch.zeros(len(arr), dtype = torch.bool)
    for elem in intersection:
        indices = indices | (torch.abs(arr - elem).sum(1) < 1e-8)
    intersection = arr[indices]
    
for i in range(len(XW_r)):
    indices = torch.zeros(len(XW_r[i]), dtype = torch.bool)
    for elem in intersection:
        indices = indices | (torch.abs(XW_r[i] - elem).sum(1) < 1e-8)
        
    XW_r[i] = XW_r[i][indices]
    YW_r[i] = YW_r[i][indices]

### Inference

In [39]:
pyro.set_rng_seed(1)

In [40]:
train = True
save_samples = True

samples_file = "data/real-data/" + "alt-samples" + ".json"

In [41]:
try:
    with open(samples_file, "r") as f:
        samples = {k: np.array(v) for k, v in json.load(f).items()}
except:
    print("Failed to load samples file")

In [42]:
if train:
    WF_distances = [pairwise_distances(XW_r[i], XF_r[i]) for i in range(len(YW_r))]

    nuts_kernel = NUTS(partial(model2, seasons=XS_r, WF_distances=WF_distances))

    mcmc = MCMC(nuts_kernel, num_samples=100, warmup_steps=400)
    mcmc_run = mcmc.run(XW_r, YW_r, YF_r)

    samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

sample: 100%|██████████| 500/500 [03:46<00:00,  2.20it/s, step size=2.71e-01, acc. prob=0.914]


In [43]:
if save_samples:
    samples_ = {k: v.tolist() for k, v in samples.items()}
    with open(samples_file, "w") as f:
        json.dump(samples_, f)

In [44]:
for site, values in summary(samples).items():
    print("Site: {}".format(site))
    print(values, "\n")

Site: delta_c
      mean       std        5%     25%       50%       75%       95%
0  0.03395  0.007396  0.020659  0.0292  0.034532  0.038911  0.044599 

Site: delta_p
       mean       std        5%       25%       50%       75%       95%
0  0.013508  0.003355  0.008386  0.011258  0.013171  0.015721  0.019617 

Site: sigma
       mean       std        5%       25%       50%       75%      95%
0  3.049775  0.088181  2.917461  2.991533  3.045503  3.094855  3.21229 

Site: theta_f
      mean       std        5%       25%       50%      75%       95%
0  0.00001  0.000002  0.000008  0.000009  0.000009  0.00001  0.000014 

Site: sf
       mean       std        5%       25%       50%       75%       95%
0  1.361661  0.202846  1.012047  1.225183  1.367800  1.516373  1.711813
1 -0.883618  0.191138 -1.176474 -1.023153 -0.898001 -0.761632 -0.545445
2  0.682479  0.171281  0.409735  0.580201  0.682519  0.785730  0.977548 



In [45]:
preds = predict2(XW_r, XF_r, YF_r, samples, recon=True)
preds = np.array(preds).mean(1)

preds = [YW_r[0].cpu().numpy()] + list(preds)
for i in range(1, len(preds)):
    preds[i] = np.minimum(preds[i] + preds[i - 1], 0)
    
preds = np.array(preds)

In [46]:
x = np.array([x.cpu().numpy() for x in YW_r])

In [47]:
np.concatenate((x, preds), axis=2)

array([[[-1.09900000e+01, -1.09900000e+01],
        [-1.93900000e+01, -1.93900000e+01],
        [-2.43800000e+01, -2.43800000e+01],
        [-1.99500000e+01, -1.99500000e+01],
        [-3.12400000e+01, -3.12400000e+01],
        [-4.47300000e+01, -4.47300000e+01]],

       [[-1.37400000e+01, -1.11911395e+01],
        [-2.49900000e+01, -1.98415004e+01],
        [-2.50600000e+01, -2.45829027e+01],
        [-1.70300000e+01, -2.01511438e+01],
        [-3.40000000e+01, -3.14411419e+01],
        [-4.53000000e+01, -4.89890996e+01]],

       [[-1.50000000e+01, -1.00306177e+01],
        [-2.38000000e+01, -1.89316922e+01],
        [-2.21400000e+01, -2.34248382e+01],
        [-1.82700000e+01, -1.89906284e+01],
        [-3.12900000e+01, -3.02806257e+01],
        [-4.53800000e+01, -5.35047305e+01]],

       [[-1.22200000e+01, -9.75371408e+00],
        [-1.90800000e+01, -1.89055025e+01],
        [-2.31000000e+01, -2.31503919e+01],
        [-2.31000000e+01, -1.87137314e+01],
        [-3.18000000e+01, 