The code in this notebook is adapted from the [tonyduan/normalizing-flows repository](https://github.com/tonyduan/normalizing-flows).

In [1]:
%matplotlib inline

from comet_ml import Experiment
import numpy as np
import scipy as sp
import scipy.stats
import itertools
import logging
import matplotlib.pyplot as plt

import torch
import torch.optim as optim
import torch.nn.functional as F
from argparse import ArgumentParser
from torch.distributions import MultivariateNormal
from torch.utils.data import DataLoader, TensorDataset

from nf.flows import *
from nf.models import NormalizingFlowModel
import matplotlib.pyplot as plt

# Intro
**Problem:** How can we evaluate the pdf of the final (target) distribution at any given point?

**Idea:** Let's define a bijection $z_k=f(z_0)$ between a simple distribution over $z_0$ with a known pdf and our distribution over $z_k$, whose pdf is unknown.
![](https://2.bp.blogspot.com/-g37e2x1miRo/Wl-g8ajU11I/AAAAAAAAHkY/PbIorxOav_Y61yFJeXsQLRlcKTzlkykYwCLcBGAs/s1600/shakir_danilo_slide.png)

**Problem:** ...but the known pdf changes at each point after every transformation $f$.
![](https://2.bp.blogspot.com/-1vyL7LpM1io/Wl-ghB0yOiI/AAAAAAAAHkM/_U94kuVeQpk22J5Mg0lbLK-EdMDkaQWggCLcBGAs/s1600/flow1.png)

**Solution:** The absolute value of the Jacobian determinant is exactly the factor by which volume changes at each point: $$J_k=\left|\det\frac{\partial f_k}{\partial z_{k-1}}\right|,\qquad z_k=f_k(z_{k-1})$$

### We can stack multiple transformations f
![](https://lilianweng.github.io/lil-log/assets/images/normalizing-flow.png)

Source: [Eric Jang's normalizing flows tutorial, part 1](https://blog.evjang.com/2018/01/nf1.html)

Thus, the final pdf of our distribution can be evaluated as 

$$p(z_k)=\frac{p(z_0)}{\prod_{i=1}^{k} J_i}$$

or, 

$$\log p(z_k)=\log p(z_0)-\sum_{i=1}^{k}\log J_i$$

# Examples of transformations
- Planar flows; $f(x) = x + u\,h(w^\intercal x + b)$
- Radial flows; $f(x) = x + \frac{\beta}{\alpha + |x - x_0|}(x - x_0)$
- Real NVP; affine coupling layer; $f(x^{(2)}) = t(x^{(1)}) + x^{(2)}\odot\exp s(x^{(1)}) $
- Masked Autoregressive Flow (MAF); $f(x_i) = (x_i - \mu(x_{<i})) / \exp(\alpha(x_{<i}))$
- Invertible 1x1 Convolution (Glow);
- ActNorm; $f(x) = Wx + b$ where $W$ is diagonal and $b$ is a constant
- Autoregressive Neural Spline Flow (NSF-AF); $f(x_i) = \mathrm{RQS}_{\theta(x_{<i})}(x_i)$
- Coupling Neural Spline Flow (NSF-CL); $f(x^{(2)}) = \mathrm{RQS}_{\theta(x^{(1)})}(x^{(2)})$

# Example
![](http://akosiorek.github.io/resources/simple_flows.png)

# Practice

In [2]:
from sklearn.datasets import make_moons  # any other 2-D dataset can be substituted here

# Tunable settings for this cell.
SEED = 0            # fixes dataset generation and mini-batch order so runs are reproducible
N_SAMPLES = 10000   # number of points drawn from the two-moons dataset
BATCH_SIZE = 64     # mini-batch size used during training

# Seed torch's global RNG; DataLoader(shuffle=True) draws its permutation from it.
torch.manual_seed(SEED)

X, _ = make_moons(n_samples=N_SAMPLES, random_state=SEED)  # class labels are not needed
X = X.astype('float32')  # make_moons returns float64; torch.zeros/torch.eye below default to float32
dim = X.shape[1]  # n_features
dataset = TensorDataset(torch.from_numpy(X))
# Reshuffle every epoch so the optimizer does not see the exact same batch sequence each time.
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

## Define model

In [3]:
import os  # stdlib; used only to read the Comet API key from the environment

# Tunable hyper-parameters for the model definition.
N_FLOWS = 4            # number of stacked MAF layers
HIDDEN_DIM = 10        # hidden width passed to each MAF layer
LEARNING_RATE = 0.005  # Adam step size

# List of flows applied in sequence (each entry is a separate MAF instance; other flows can be stacked here).
flows = [MAF(dim=dim, hidden_dim=HIDDEN_DIM) for _ in range(N_FLOWS)]
# Prior (base) distribution: multivariate normal with zero mean and identity covariance.
prior = MultivariateNormal(torch.zeros(dim), torch.eye(dim))
model = NormalizingFlowModel(dim, prior, flows)

# Optimizer that fits the parameters of the flow.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# SECURITY: the API key must not be committed with the notebook. Export it before
# starting Jupyter (e.g. `export COMET_API_KEY=...`). Any key that was previously
# hardcoded here should be treated as leaked and revoked.
# NOTE(review): when the variable is unset, None is passed and Comet is expected to
# fall back to its own configuration lookup - confirm against the Comet docs.
experiment = Experiment(api_key=os.environ.get("COMET_API_KEY"),
                        project_name="yandex-school-nf", workspace="holybayes")

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/holybayes/yandex-school-nf/af96188867ab40bf8dd7d98a271c9d1c



In [4]:
# Scatter plot of the training data, uploaded to Comet rather than shown inline.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(X[:, 0], X[:, 1])
ax.set_xlim(-1, 2)
ax.set_ylim(-1, 2)
ax.set(title='Original data', xlabel='$x_1$', ylabel='$x_2$')

# Log the actual Figure object (not the pyplot module), then close it: plt.clf()
# only empties the figure, which leaves a blank "0 Axes" figure as the cell output.
experiment.log_figure('original data', overwrite=True, figure=fig)
plt.close(fig)

<Figure size 720x720 with 0 Axes>

## Train the model

In [5]:
epoches = 1000          # number of passes over the training data
EVAL_EVERY = 10         # log diagnostic figures every EVAL_EVERY epochs
N_EVAL_POINTS = 10000   # points used for the density plot and for sampling
steps_per_epoch = len(dataloader)

# try/finally guarantees the Comet experiment is closed even if training is
# interrupted (e.g. KeyboardInterrupt from stopping the cell).
try:
    for epoch in range(epoches):
        model.train()
        for step, (batch_x,) in enumerate(dataloader):  # TensorDataset yields 1-element batches
            optimizer.zero_grad()
            z, logp_z, log_det = model(batch_x)
            # "+" because this implementation maps data x to latent z (f is the inverse direction).
            logp_x = logp_z + log_det
            # Train the flow by maximizing the likelihood, i.e. minimizing the mean negative log-likelihood.
            loss = -torch.mean(logp_x)
            loss.backward()
            optimizer.step()
            # Use a monotonically increasing step; the in-epoch index restarts at 0
            # every epoch and would make logged points collide.
            global_step = epoch * steps_per_epoch + step
            experiment.log_metric('Loss', loss.item(), step=global_step, epoch=epoch)

        if epoch % EVAL_EVERY == 0:
            # Test
            model.eval()
            with torch.no_grad():
                # Evaluate the learned log-density on points drawn uniformly from [-1, 2]^2.
                X_test = np.random.uniform(-1, 2, (N_EVAL_POINTS, 2)).astype('float32')
                z, logp_z, log_det = model(torch.from_numpy(X_test))
                logp_x = logp_z + log_det
                fig, ax = plt.subplots(figsize=(10, 10))
                ax.scatter(X_test[:, 0], X_test[:, 1], c=logp_x)
                ax.set_title('Predicted log-density')
                experiment.log_figure('Predicted density', figure=fig, step=epoch, overwrite=True)
                # Close (not just clear) so figures do not pile up over the run.
                plt.close(fig)

                # Draw samples from the model. They are plotted without colouring:
                # logp_x above belongs to X_test, not to these samples.
                X_sampled = model.sample(N_EVAL_POINTS)
                fig, ax = plt.subplots(figsize=(10, 10))
                ax.scatter(X_sampled[:, 0], X_sampled[:, 1])
                ax.set_title('Samples from the model')
                experiment.log_figure('Samples', figure=fig, step=epoch, overwrite=True)
                plt.close(fig)
finally:
    experiment.end()



KeyboardInterrupt: 

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>