# Predicting Ground Water Levels with Kernel Regression

In [1]:
from __future__ import absolute_import, division, print_function

import os
import json
import pyro
import torch
import pickle
import logging
import numpy as np
import pandas as pd
import seaborn as sns
import pyro.optim as optim
import pyro.contrib.gp as gp
import matplotlib.pyplot as plt
import pyro.distributions as dist
import matplotlib.animation as animation

from torch.distributions import constraints

from functools import partial
from pyro.infer.mcmc import NUTS
from pyro.infer.mcmc.api import MCMC
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import Image, Video
from pyro.contrib.autoguide import AutoMultivariateNormal
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO, JitTrace_ELBO

# Seed Pyro's RNG so that runs of the notebook are repeatable.
pyro.set_rng_seed(0)

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Log bare messages (no level/name prefix) at INFO and above.
logging.basicConfig(format="%(message)s", level=logging.INFO)

# Enable validation checks
pyro.enable_validation(True)
# True when a "CI" environment variable is set.
# NOTE(review): not referenced anywhere else in the visible notebook.
smoke_test = "CI" in os.environ
# Fail fast when the installed Pyro release is not the 0.4.1 series.
assert pyro.__version__.startswith("0.4.1")

In [3]:
# Re-seed Pyro's RNG; this replaces the seed of 0 set right after the imports.
pyro.set_rng_seed(1)

In [4]:
# When CUDA is available, make CUDA float tensors the default tensor type.
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

## Helper Functions

In [5]:
def pairwise_distances(x, y=None):
    """Return the squared Euclidean distance between every row of x and y.

    Args:
        x: 2-D tensor whose rows are points.
        y: optional 2-D tensor of points; when omitted, distances are
            computed among the rows of ``x`` itself.

    Returns:
        Tensor of shape ``(x.shape[0], y.shape[0])`` whose ``(i, j)`` entry is
        ``||x[i] - y[j]||**2``, clamped so it is never negative.
    """
    other = x if y is None else y

    x_sq = x.pow(2).sum(1).view(-1, 1)
    other_sq = other.pow(2).sum(1).view(1, -1)

    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * <a, b>
    cross = torch.mm(x, torch.transpose(other, 0, 1))
    squared = x_sq + other_sq - 2.0 * cross

    # Floating-point cancellation can leave tiny negative values; clip them.
    return torch.clamp(squared, 0.0, np.inf)

In [6]:
def summary(samples):
    """Summarise posterior samples site by site.

    Args:
        samples: mapping from site name to an array-like of draws (one row
            per draw).

    Returns:
        Dict mapping each site name to a DataFrame with one row per column
        of that site's draws and the columns mean, std, 5%, 25%, 50%, 75%
        and 95%.
    """
    quantiles = [.05, 0.25, 0.5, 0.75, 0.95]
    wanted = ["mean", "std", "5%", "25%", "50%", "75%", "95%"]

    def describe_site(values):
        # describe() puts statistics on the rows; transpose so they become columns.
        stats = pd.DataFrame(values).describe(percentiles=quantiles)
        return stats.transpose()[wanted]

    return {name: describe_site(values) for name, values in samples.items()}

In [7]:
def visualize_posterior(samples):
    """Plot the marginal density of every sample site on a square grid.

    Args:
        samples: mapping from site name to the draws for that site. Each
            site gets its own panel, titled with the site name.

    The figure is created through pyplot; nothing is returned.
    """
    import math

    sites = list(samples.keys())

    # Smallest square grid with at least one panel per site.
    side = int(math.ceil(math.sqrt(len(sites))))

    # squeeze=False keeps `axs` a 2-D array even for a 1x1 grid; with the
    # default, a single site yields a bare Axes and `.reshape` fails.
    fig, axs = plt.subplots(nrows=side, ncols=side, figsize=(15, 13), squeeze=False)
    fig.suptitle("Marginal Posterior Density", fontsize=16)

    # zip stops after the last site, leaving any surplus panels empty.
    for site, ax in zip(sites, axs.reshape(-1)):
        sns.distplot(samples[site], ax=ax)
        ax.set_title(site)

## Defining the Model

### Generative Model
---
**Farm Factor**
\begin{align*}
    \ln(\delta) \sim \mathcal{N}(1.0, 0.5)
\end{align*}

**Distance Factors**
\begin{align*}
    \ln(\theta_w) \sim \mathcal{N}(0.0, 0.5) \\
    \ln(\theta_f) \sim \mathcal{N}(0.0, 0.5)
\end{align*}

**Observation Noise**
\begin{align*}
    \sigma \sim \text{Gam}(1.0, 1.0)
\end{align*}

**Seasonal Factors**
For season $s \in \mathcal{S}$
\begin{align*}
    \gamma_s \sim \mathcal{N}(0.0, 1.0)
\end{align*}

**Base Water Levels**

The base water levels are modeled as a simple AR(1) process with a seasonal drift term. The details are as follows:

\begin{align*}
    \mu_0 \sim \mathcal{N}(\gamma_{s_0}, 1.0) \\
\end{align*}
For $t = 1 \dots T$, we specify
\begin{align*}
    \mu_{t} \sim \mathcal{N}(\mu_{t - 1} + \gamma_{s_t}, 1.0)
\end{align*}

**Likelihood**

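For $t = 0 \dots T$, we specify
\begin{align*}
    \mathbf{y}_t \sim \mathcal{N}\left(\mu_t - \delta \cdot K_{\theta_f}(X_{t,w}, X_{t,f})\, \mathbf{y}_{t,f},\ \sigma\right)
\end{align*}
where $K_{\theta_f}(x, x') = \exp(-\lVert x - x' \rVert^2 / \theta_f)$ is evaluated between every well and every farm, $\mathbf{y}_{t,f}$ are the farm observations at time $t$, and $\sigma$ is the standard deviation of the observation noise.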

---

<img src="includes/hmm-model.png" alt="drawing" width="600"/>

In [8]:
def model(XW, YW, YF, WF_distances, gp=False, n_seasons=3):
    """Pyro model of well water levels driven by seasons and nearby farms.

    Samples a farm factor ``delta``, a farm length-scale ``theta_f``, one
    offset per season (``sf``) and a latent base level ``mu_t`` for every
    timestep, then conditions on the observed well levels ``YW[t]``.

    Args:
        XW: per-timestep well coordinates; only read when ``gp`` is true.
        YW: per-timestep observed well levels; ``len(YW)`` sets the number
            of timesteps.
        YF: per-timestep farm observations, weighted by the well-farm kernel.
        WF_distances: per-timestep well-to-farm distances (the notebook
            passes the output of ``pairwise_distances``).
        gp: when true, sample ``theta_w`` and use a MultivariateNormal
            likelihood whose covariance is ``exp(-d(XW, XW) / theta_w)``;
            otherwise sample a scalar ``sigma`` and use an independent
            Normal likelihood. Inside this function the name shadows the
            ``pyro.contrib.gp`` module imported at file level.
        n_seasons: number of seasonal offsets; timestep ``t`` uses offset
            ``t % n_seasons``. Defaults to the previously hard-coded 3.
    """
    # Refuse to run while a torch JIT trace is being recorded.
    assert not torch._C._get_tracing_state()

    delta = pyro.sample("delta", dist.LogNormal(1.0, 0.5))

    if gp:
        theta_w = pyro.sample("theta_w", dist.LogNormal(0.0, 0.5))    
    else:
        sigma = pyro.sample("sigma", dist.Gamma(1.0, 1.0))
    
    theta_f = pyro.sample("theta_f", dist.LogNormal(0.0, 0.5))
    
    sf = pyro.sample("sf", dist.Normal(torch.zeros(n_seasons), 1.0))

    # NOTE(review): this plate is created but never entered; the per-timestep
    # plate below is used instead. Left in place so the model is unchanged.
    data_plate = pyro.plate("data", len(YW[0]))
        
    mu = 0
    for t in pyro.markov(range(len(YW))):
        if gp:
            # Kernel matrix between wells, used as the covariance below.
            sigma = torch.exp(-pairwise_distances(XW[t], XW[t]) / theta_w)
                
        # Base level: previous level plus the offset of the current season.
        mu = pyro.sample(
            "mu_{}".format(t), dist.Normal(mu + sf[t % n_seasons], 1.0)
        )
        
        # Subtract the kernel-weighted farm effect.
        # NOTE(review): this is an elementwise product summed over dim 1,
        # whereas predict() uses a matrix product -- confirm both agree for
        # the shapes of YF[t] actually passed in.
        mean = mu - delta * (YF[t] * torch.exp(-WF_distances[t] / theta_f)).sum(1)
        
        if gp:
            pyro.sample(
                "obs_{}".format(t), dist.MultivariateNormal(mean, sigma), obs=YW[t]
            )
        else:
            with pyro.plate("data_{}".format(t), len(YW[t])):
                pyro.sample(
                    "obs_{}".format(t), dist.Normal(mean, sigma), obs=YW[t]
                )

In [9]:
def predict(XW, XF, YF, samples, gp=False):
    """Draw posterior-predictive well levels for every timestep.

    Args:
        XW: per-timestep well coordinates (tensors).
        XF: per-timestep farm coordinates (tensors).
        YF: per-timestep farm observations (tensors); ``len(YF)`` sets the
            number of timesteps.
        samples: mapping from site name to posterior draws. Must contain
            ``delta``, ``theta_f`` and ``mu_<t>`` for every timestep, plus
            ``theta_w`` when ``gp`` is true or ``sigma`` otherwise.
        gp: when true, draw from a multivariate normal whose covariance is
            ``exp(-d(XW, XW) / theta_w)``, matching the GP branch of
            ``model``; otherwise draw independent normals with scale
            ``sigma``.

    Returns:
        A list with one array per timestep; the first axis of each array
        indexes the posterior draw.
    """
    delta = samples["delta"]
    theta_f = samples["theta_f"]

    # The GP model never samples "sigma", so only read the site that the
    # selected likelihood actually uses.
    if gp:
        theta_w = samples["theta_w"]
    else:
        sigma = samples["sigma"]

    # Rows index the posterior draw, columns the timestep.
    mu = np.array(list(zip(*[samples["mu_{}".format(t)] for t in range(len(YF))])))

    predictions = []
    for t in range(len(YF)):
        farm_obs = YF[t].cpu().numpy()
        well_farm = pairwise_distances(XW[t], XF[t]).cpu().numpy()
        if gp:
            well_well = pairwise_distances(XW[t]).cpu().numpy()

        draws = []
        for i in range(len(delta)):
            mean = mu[i, t] - delta[i] * np.matmul(np.exp(-well_farm / theta_f[i]), farm_obs)

            if gp:
                # The kernel matrix is a covariance, not a per-element
                # standard deviation, so sample jointly and then restore
                # the shape of the mean.
                cov = np.exp(-well_well / theta_w[i])
                draw = np.random.multivariate_normal(np.ravel(mean), cov)
                draw = draw.reshape(np.shape(mean))
            else:
                draw = np.random.normal(mean, sigma[i])

            draws.append(draw)

        predictions.append(np.array(draws))

    return predictions

## Working with Sample Data

In [10]:
# Load the synthetic dataset; rows are tagged as "well" or "farm" in `type`.
data = pd.read_csv("data/sample-data/data.csv", encoding="ISO-8859-1")

data_wells = data[data.type == "well"]
data_farms = data[data.type == "farm"]

# Collect well coordinates and observations, one entry per timestep, in the
# order the timesteps first appear in the file.
XW, YW = [], []
for t in data_wells["timestep"].unique():
    data_ = data_wells[data_wells["timestep"] == t]

    XW.append(data_[["latitude", "longitude"]].values)
    YW.append(data_["observation"].values)
    
# Keep only the first timestep's well coordinates.
XW = XW[0]

# Farm rows are not split by timestep.
XF = data_farms[["latitude", "longitude"]].values
YF = data_farms["observation"].values

In [11]:
# Animate the sample data: farms are drawn once as green squares, wells are
# re-shaded at every timestep from their observed level.
plt.clf()
fig = plt.figure(figsize=(10, 10), dpi=100)

plt.ion()

plt.scatter(XF[:, 0], XF[:, 1], marker="s", s=7, color="lightgreen")

scat = plt.scatter(XW[:, 0], XW[:, 1], marker="s", s=20, c=[(0, 0, 0, 1)] * len(XW))
label = plt.text(0, 0, '', fontsize=12)

# One row of per-well shades for every timestep.
colors = []
for obs in YW:
    colors.append([min(1 - abs(x) / 15, 1) for x in obs])
    
colors = np.array(colors)

def update_plot(i, scat):
    """Shade the wells for timestep ``i`` and show that timestep's season."""
    scat.set_array(colors[i])
    label.set_text(["Sp", "Su", "Fa", "Wi"][i % 4])
    return scat,

# `XW` holds one row per well at this point, so the frame count has to come
# from the number of timesteps (rows of `colors`) rather than len(XW).
anim = animation.FuncAnimation(fig, update_plot, frames=range(len(colors)), fargs=(scat,), interval=1000)

plt.gray()
plt.close()

<Figure size 432x288 with 0 Axes>

In [12]:
# Write the animation to disk at one frame per second.
anim.save("includes/sample-data-animation.mp4", fps=1)

Animation.save using <class 'matplotlib.animation.FFMpegWriter'>
MovieWriter.run: running command: ['ffmpeg', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-s', '1000x1000', '-pix_fmt', 'rgba', '-r', '1', '-loglevel', 'error', '-i', 'pipe:', '-vcodec', 'h264', '-pix_fmt', 'yuv420p', '-y', 'includes/sample-data-animation.mp4']


In [13]:
# Display the saved video file in the notebook.
Video("includes/sample-data-animation.mp4")

In [14]:
# Convert the sample data to tensors; indexing with [..., None] appends a
# size-1 trailing dimension to the observations.
XW = torch.tensor(XW)
YW = torch.tensor(YW)[..., None]

XF = torch.tensor(XF)
YF = torch.tensor(YF)[..., None]

In [15]:
timesteps = len(YW)

# Tile the well coordinates and the farm data along a new leading axis so
# that each can be indexed by timestep.
XW = XW.repeat(timesteps, 1, 1)

YF = YF.repeat(timesteps, 1, 1)
XF = XF.repeat(timesteps, 1, 1)

### Inference

In [16]:
# Switches for the sample-data run.
train = False          # run MCMC in the training cell
use_gp = False         # select the GP likelihood instead of kernel regression
save_samples = False   # write the samples back to `samples_file`

# Stored posterior samples: "gp-samples.json" or "kr-samples.json".
samples_file = "data/sample-data/" + ("gp-samples" if use_gp else "kr-samples") + ".json"

In [17]:
# Load previously stored posterior samples, if any. Only I/O and JSON
# decoding failures are treated as "no stored samples"; a bare `except`
# would also swallow KeyboardInterrupt and SystemExit.
try:
    with open(samples_file, "r") as f:
        samples = {k: np.array(v) for k, v in json.load(f).items()}
except (IOError, OSError, ValueError):
    # IOError is listed separately for Python 2, where it is not an OSError.
    print("Failed to load samples file")

In [18]:
if train:
    # Well-to-farm distances for every timestep, computed once up front.
    WF_distances = [pairwise_distances(XW[i], XF[i]) for i in range(len(YW))]

    # Bind the precomputed distances and the likelihood choice into the model.
    nuts_kernel = NUTS(partial(model, WF_distances=WF_distances, gp=use_gp))

    mcmc = MCMC(nuts_kernel, num_samples=100, warmup_steps=400)
    # NOTE(review): `mcmc_run` is not read anywhere in the visible notebook.
    mcmc_run = mcmc.run(XW, YW, YF)

    # Move the draws to NumPy so they can be summarised and stored as JSON.
    samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

In [19]:
if save_samples:
    # Arrays are converted to nested lists because json cannot encode ndarrays.
    samples_ = {k: v.tolist() for k, v in samples.items()}
    with open(samples_file, "w") as f:
        json.dump(samples_, f)

In [20]:
# Print the per-site summary statistics.
for site, values in summary(samples).items():
    print("Site: {}".format(site))
    print(values, "\n")

Site: delta
      mean      std        5%      25%      50%       75%       95%
0  0.69229  0.00361  0.686609  0.68985  0.69248  0.694578  0.699596 

Site: sigma
       mean       std       5%       25%       50%       75%       95%
0  0.494591  0.014084  0.47054  0.483666  0.496779  0.504777  0.517015 

Site: theta_f
     mean       std        5%       25%      50%       75%       95%
0  0.5986  0.004159  0.592153  0.595778  0.59831  0.600883  0.606531 

Site: sf
       mean       std        5%       25%       50%       75%       95%
0 -1.295759  0.267506 -1.680994 -1.476214 -1.338350 -1.100939 -0.870364
1 -1.782890  0.310849 -2.361393 -1.945206 -1.780396 -1.576902 -1.285234
2  2.591550  0.317839  2.090448  2.392160  2.572209  2.771738  3.226744
3  0.001271  0.315721 -0.481662 -0.224385 -0.012434  0.206743  0.489497 

Site: mu_0
      mean       std        5%       25%       50%       75%       95%
0 -5.23217  0.102741 -5.385782 -5.296615 -5.233654 -5.173446 -5.051048 

Site: mu_1
   

In [21]:
# Posterior-predictive draws for the sample data, one array per timestep.
predictions = predict(XW, XF, YF, samples, use_gp)

## Working with Real Data

In [22]:
# The file holds four consecutive pickles, read in this order: XF_r, YF_r,
# XW_r, YW_r. Each one is iterated and its items converted to NumPy arrays.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("data/dataset.pkl", "rb") as f:
    XF_r = [np.array(x) for x in pickle.load(f)]
    YF_r = [np.array(x) for x in pickle.load(f)]
                        
    XW_r = [np.array(x) for x in pickle.load(f)]
    YW_r = [np.array(x) for x in pickle.load(f)]

In [23]:
# Animate the real data: both the farm and the well scatter are moved to
# that frame's coordinates, and the wells are re-shaded from their levels.
plt.clf()
fig = plt.figure(figsize=(10, 10), dpi=100)

plt.ion()

scat_f = plt.scatter(XF_r[0][:, 0], XF_r[0][:, 1], marker="s", s=7, color="lightgreen")

scat_w = plt.scatter(XW_r[0][:, 0], XW_r[0][:, 1], marker="s", s=20, c=[(0, 0, 0, 1)] * len(XW_r[0]))
label = plt.text(0, 0, '', fontsize=12)

# Replaces the `update_plot` defined for the sample-data animation.
def update_plot(i, scat_w, scat_f):
    # Move the wells to frame i's coordinates and shade them by |level| / 50.
    scat_w.set_offsets(XW_r[i])
    scat_w.set_array(np.array([min(1 - abs(x[0]) / 50, 1) for x in YW_r[i]]))
    
    scat_f.set_offsets(XF_r[i])
    return scat_w, scat_f

# One frame per entry of XW_r.
anim = animation.FuncAnimation(fig, update_plot, frames=range(len(XW_r)), fargs=(scat_w, scat_f), interval=1000)

plt.gray()
plt.close()

<Figure size 432x288 with 0 Axes>

In [24]:
# Write the animation to disk at one frame per second.
anim.save("includes/data-animation.mp4", fps=1)

Animation.save using <class 'matplotlib.animation.FFMpegWriter'>
MovieWriter.run: running command: ['ffmpeg', '-f', 'rawvideo', '-vcodec', 'rawvideo', '-s', '1000x1000', '-pix_fmt', 'rgba', '-r', '1', '-loglevel', 'error', '-i', 'pipe:', '-vcodec', 'h264', '-pix_fmt', 'yuv420p', '-y', 'includes/data-animation.mp4']


In [25]:
# Display the saved video file in the notebook.
Video("includes/data-animation.mp4")

In [26]:
# Convert every array in the four lists to a tensor; they stay as lists.
XF_r = [torch.tensor(x) for x in XF_r]
YF_r = [torch.tensor(x) for x in YF_r]

XW_r = [torch.tensor(x) for x in XW_r]
YW_r = [torch.tensor(x) for x in YW_r]

In [27]:
# Stack every well and farm coordinate tensor into one tensor.
Xs = torch.cat(XW_r + XF_r)

In [28]:
# Per-column mean and standard deviation over all stacked coordinates.
x_mean = Xs.mean(0)
x_std = Xs.std(0)

In [29]:
# Standardise well and farm coordinates with the shared statistics.
XW_r = [(x - x_mean) / x_std for x in XW_r]
XF_r = [(x - x_mean) / x_std for x in XF_r]

### Inference

In [None]:
# Switches for the real-data run.
train = True           # run MCMC in the training cell
use_gp = False         # select the GP likelihood instead of kernel regression
save_samples = True    # write the samples back to `samples_file`

# Stored posterior samples: "gp-samples.json" or "kr-samples.json".
samples_file = "data/real-data/" + ("gp-samples" if use_gp else "kr-samples") + ".json"

In [31]:
# Load previously stored posterior samples, if any. Only I/O and JSON
# decoding failures are treated as "no stored samples"; a bare `except`
# would also swallow KeyboardInterrupt and SystemExit.
try:
    with open(samples_file, "r") as f:
        samples = {k: np.array(v) for k, v in json.load(f).items()}
except (IOError, OSError, ValueError):
    # IOError is listed separately for Python 2, where it is not an OSError.
    print("Failed to load samples file")

In [32]:
if train:
    # Well-to-farm distances for every timestep, computed once up front.
    WF_distances = [pairwise_distances(XW_r[i], XF_r[i]) for i in range(len(YW_r))]

    # Bind the precomputed distances and the likelihood choice into the model.
    nuts_kernel = NUTS(partial(model, WF_distances=WF_distances, gp=use_gp))

    mcmc = MCMC(nuts_kernel, num_samples=100, warmup_steps=400)
    # NOTE(review): `mcmc_run` is not read anywhere in the visible notebook.
    mcmc_run = mcmc.run(XW_r, YW_r, YF_r)

    # Move the draws to NumPy so they can be summarised and stored as JSON.
    samples = {k: v.detach().cpu().numpy() for k, v in mcmc.get_samples().items()}

In [33]:
if save_samples:
    # Arrays are converted to nested lists because json cannot encode ndarrays.
    samples_ = {k: v.tolist() for k, v in samples.items()}
    with open(samples_file, "w") as f:
        json.dump(samples_, f)

In [34]:
# Print the per-site summary statistics.
for site, values in summary(samples).items():
    print("Site: {}".format(site))
    print(values, "\n")

Site: delta
       mean       std        5%       25%       50%       75%       95%
0  0.000219  0.000072  0.000126  0.000165  0.000201  0.000247  0.000356 

Site: sigma
        mean       std         5%        25%        50%        75%        95%
0  23.180259  0.131626  22.958813  23.097648  23.183979  23.267464  23.403579 

Site: theta_f
       mean       std        5%       25%       50%       75%       95%
0  0.000265  0.000086  0.000132  0.000203  0.000262  0.000314  0.000401 

Site: sf
       mean       std        5%       25%       50%       75%       95%
0 -3.138915  0.499943 -4.043172 -3.460290 -3.067657 -2.799999 -2.419570
1 -1.147099  0.480017 -2.015271 -1.437748 -1.138173 -0.858489 -0.361831
2  1.178620  0.531882  0.373820  0.843479  1.159863  1.544441  1.960773 

Site: mu_0
        mean       std        5%        25%        50%        75%        95%
0 -21.714631  0.541971 -22.63309 -22.088746 -21.655123 -21.364281 -20.847652 

Site: mu_1
        mean       std         5%  

In [None]:
# Posterior-predictive draws for the real data, one array per timestep.
predictions = predict(XW_r, XF_r, YF_r, samples, use_gp)

In [None]:
# Display the predictions as the cell's output.
predictions

In [None]:
# Display the observed well levels as the cell's output.
YW_r