# Variation 1: Handling an Unknown Amount of Data 

**Table of contents:**
 - [Problem description](#Problem-description)
 - [Passing arguments to our program](#Passing-arguments-to-our-program)
 - [Creating sample paths](#Creating-sample-paths)
 - [Writing HDF5 files](#Writing-HDF5-files)
 - [Discussion](#Discussion)



## Problem description

**New challenge:** The length of each path is random, so the total amount of data is not known in advance, and we produce the paths incrementally in fixed-size batches.

## Passing arguments to our program

In [None]:
%%writefile src/parse_arguments1.hpp
#ifndef PARSE_ARGUMENTS1_HPP
#define PARSE_ARGUMENTS1_HPP

#include "argparse.hpp"

// Registers the command-line options the program looks for (--paths, --batch,
// --dt, --theta, --mu, --sigma) on `program`, each with a default value
extern void set_options1(argparse::ArgumentParser& program);

// Tests the options and retrieves the arguments from the parsed `program`
// into the output parameters.
// Returns 0 on success. On the first value that fails validation (a path count
// or batch size of zero, or dt/theta/sigma smaller than DBL_MIN) it prints a
// message to stderr and returns -1; `mu` is not range-checked.
extern int get_arguments1
(
    const argparse::ArgumentParser& program,
    size_t&                         path_count,
    size_t&                         batch_size,
    double&                         dt,
    double&                         theta,
    double&                         mu,
    double&                         sigma
);

#endif

In [None]:
%%writefile src/parse_arguments1.cpp
#include "parse_arguments1.hpp"
#include <cfloat>
#include <iostream>

using namespace std;

// Registers every command-line option understood by the program on `program`.
// Each option has a short and a long spelling, a help text, a default value
// (used when the option is absent) and a scan<> spec that converts the
// textual argument to the expected numeric type.
void set_options1(argparse::ArgumentParser& program)
{
    program.add_argument("-p", "--paths")  // command line arguments
    .help("chooses the number of paths")   // synopsis
    .default_value(size_t{100})            // default value
    .scan<'u', size_t>();                  // expected type

    program.add_argument("-b", "--batch")
    .help("chooses the batch size")
    .default_value(size_t{100})
    .scan<'u', size_t>();

    program.add_argument("-d", "--dt")
    .help("chooses the time step")
    .default_value(double{0.01})
    .scan<'f', double>();

    program.add_argument("-t", "--theta")
    .help("chooses the rate of reversion to the mean")
    .default_value(double{1.0})
    .scan<'f', double>();

    program.add_argument("-m", "--mu")
    .help("chooses the long-term mean of the process")
    .default_value(double{0.0})
    .scan<'f', double>();

    // "-g" because "-s" style short names are avoided here; long name is --sigma
    program.add_argument("-g", "--sigma")
    .help("chooses the volatility of the process")
    .default_value(double{0.1})
    .scan<'f', double>();
}

// Copies the parsed option values out of `program` into the output parameters
// and validates them one by one.
//
// Returns 0 when every value is acceptable. On the first invalid value a
// message is written to stderr and -1 is returned; output parameters assigned
// up to that point keep the values read so far.
//
// The floating-point checks are written as !(x >= DBL_MIN) rather than
// (x < DBL_MIN): both reject zero, negative and subnormal values, but only the
// negated form also rejects NaN, for which every comparison is false.
int get_arguments1
(
    const argparse::ArgumentParser& program,
    size_t&                         path_count,
    size_t&                         batch_size,
    double&                         dt,
    double&                         theta,
    double&                         mu,
    double&                         sigma
)
{
    path_count = program.get<size_t>("--paths");
    if (path_count == 0) {
        std::cerr << "Number of paths must be greater than zero" << std::endl;
        return -1;
    }
    batch_size = program.get<size_t>("--batch");
    if (batch_size == 0) {
        std::cerr << "Batch size must be greater than zero" << std::endl;
        return -1;
    }
    dt = program.get<double>("--dt");
    if (!(dt >= DBL_MIN)) {
        std::cerr << "Time step must be greater than zero" << std::endl;
        return -1;
    }
    theta = program.get<double>("--theta");
    if (!(theta >= DBL_MIN)) {
        std::cerr << "Reversion rate must be greater than zero" << std::endl;
        return -1;
    }
    // The long-term mean may take any sign, so it is not range-checked
    mu = program.get<double>("--mu");
    sigma = program.get<double>("--sigma");
    if (!(sigma >= DBL_MIN)) {
        std::cerr << "Volatility must be greater than zero" << std::endl;
        return -1;
    }

    return 0;
}

## Creating sample paths

In [None]:
%%writefile src/ou_sampler1.hpp
#ifndef OU_SAMPLER1_HPP
#define OU_SAMPLER1_HPP

#include "hdf5.h"
#include <vector>

// Creates `batch_size` sample paths of random length with parameters
// `dt`, `theta`, `mu`, and `sigma`.
// Both output vectors are cleared first. `ou_process` receives the samples of
// all paths in one buffer; `offset` receives batch_size + 1 cumulative path
// lengths (the first entry is 0), meant to be used as the start positions of
// the paths within `ou_process`.
extern void ou_sampler1
(
    std::vector<double>&  ou_process,
    std::vector<hsize_t>& offset,
    const size_t&         batch_size,
    const double&         dt,
    const double&         theta,
    const double&         mu,
    const double&         sigma
);

#endif

In [None]:
%%bash
# Compile the argument-handling code to an object file (no linking, -c);
# -I./include is presumably where argparse.hpp lives -- confirm the location
g++ -std=c++17 -Wall -pedantic -I./include -c ./src/parse_arguments1.cpp -o ./build/parse_arguments1.o

In [None]:
%%writefile src/ou_sampler1.cpp

#include "ou_sampler1.hpp"
#include <climits>
#include <cmath>
#include <random>

using namespace std;

// Generates one batch of Ornstein-Uhlenbeck sample paths of random length.
//
// ou_process  [out] cleared, then filled with the samples of all paths stored
//                   back to back in one contiguous buffer
// offset      [out] cleared, then filled with batch_size + 1 entries:
//                   offset[i] is the index in ou_process of the first sample
//                   of path i, and offset[batch_size] == ou_process.size()
// batch_size        number of paths to generate
// dt                time step
// theta             rate of reversion to the mean
// mu                long-term mean
// sigma             volatility
//
// Every path starts at x = 0 and has between 1 and USHRT_MAX samples. The
// generator is seeded from std::random_device on each call.
void ou_sampler1
(
    std::vector<double>&  ou_process,
    std::vector<hsize_t>& offset,
    const size_t&         batch_size,
    const double&         dt,
    const double&         theta,
    const double&         mu,
    const double&         sigma
)
{
    // Store sample paths in one contiguous buffer
    ou_process.clear();
    offset.clear();
    offset.reserve(batch_size + 1);
    offset.push_back(0);

    std::random_device rd;
    std::mt19937 generator(rd());
    // path length is between 1 and 65535
    std::uniform_int_distribution<unsigned int> path_len_dist(1, USHRT_MAX);
    // Brownian increments: mean 0, standard deviation sqrt(dt), i.e. variance dt
    std::normal_distribution<double> dist(0.0, std::sqrt(dt));

    // Generate a batch of paths and offsets
    for (size_t i = 0; i < batch_size; ++i)
    {
        // Generate random path length
        const size_t step_count = path_len_dist(generator);

        // The new path begins right after the last sample of the previous one.
        // Taking the start from the current size (instead of a hand-advanced
        // cursor) guarantees paths never overlap and that the buffer length
        // always equals the sum of the path lengths recorded in `offset`.
        const size_t first = ou_process.size();
        ou_process.resize(first + step_count);

        // Generate the path
        ou_process[first] = 0;  // start at x = 0
        for (size_t j = 1; j < step_count; ++j)
        {
            const double dW = dist(generator);
            const double prev = ou_process[first + j - 1];
            ou_process[first + j] = prev + theta * (mu - prev) * dt + sigma * dW;
        }

        // This is the offset of the next path
        offset.push_back(offset.back() + static_cast<hsize_t>(step_count));
    }
}


In [None]:
%%bash
# Compile the sampler to an object file (no linking, -c); the -I path is
# where this build expects to find hdf5.h
g++ -std=c++17 -Wall -pedantic -I/usr/include/hdf5/serial -c ./src/ou_sampler1.cpp -o ./build/ou_sampler1.o

## Writing HDF5 files

In [None]:
%%writefile src/ou_hdf5.1.cpp
#include "parse_arguments1.hpp"
#include "ou_sampler1.hpp"

#include "hdf5.h"
#include <vector>

using namespace std;

int main(int argc, char *argv[])
{
    size_t path_count, batch_size;
    double dt, theta, mu, sigma;

    argparse::ArgumentParser program("ou_hdf5.1");
    set_options1(program);
    program.parse_args(argc, argv);
    get_arguments1(program, path_count, batch_size, dt, theta, mu, sigma);

    cout << "Running with parameters:"
         << " paths=" << path_count << " batch=" << batch_size
         << " dt=" << dt << " theta=" << theta << " mu=" << mu << " sigma=" << sigma << endl;

    auto file = H5Fcreate("ou_process.1.h5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
    hid_t paths, descr;

    auto lcpl = H5Pcreate(H5P_LINK_CREATE);
    H5Pset_create_intermediate_group(lcpl, 1);

    { // create the extendible `paths/data` dataset
        hsize_t dimsf[] = {0, H5S_UNLIMITED};
        auto space = H5Screate_simple(1, dimsf, &dimsf[1]);
        auto dcpl = H5Pcreate(H5P_DATASET_CREATE);
        hsize_t cdims[] = {128 * 1024};
        H5Pset_chunk(dcpl, 1, cdims);
        paths = H5Dcreate(file, "/paths/data", H5T_NATIVE_DOUBLE, space, lcpl, dcpl, H5P_DEFAULT);
        H5Pclose(dcpl);
        H5Sclose(space);
    }
    { // create the fixed size descriptors (= offsets into paths dataset) dataset `paths/descr`
        hsize_t dimsf[] = { (hsize_t) path_count };
        auto space = H5Screate_simple(1, dimsf, NULL);
        descr = H5Dcreate(file, "/paths/descr", H5T_NATIVE_HSIZE, space, lcpl, H5P_DEFAULT, H5P_DEFAULT);
        H5Sclose(space);    
    }
    
    H5Pclose(lcpl);

    // vectors to store the paths and the descriptors in a batch
    vector<double> ou_process;
    vector<hsize_t> offset;

    // track the global (=across batches) offset
    hsize_t global_pos = 0;  // 

    for (size_t p = 0; p < path_count; p += batch_size)
    {
        if (p + batch_size > path_count)  // last batch
            batch_size = path_count - p;
        cout << "Generating paths " << p << " to " << p + batch_size << endl;
        
        // Generate a batch of paths and offsets
        ou_sampler1(ou_process, offset, batch_size, dt, theta, mu, sigma);

        { // write the paths
            auto path_space = H5Dget_space(paths);
            hsize_t path_elem_count = (hsize_t) H5Sget_simple_extent_npoints(path_space);  // 1D dataset
            H5Sclose(path_space);
            hsize_t path_dims[] = {path_elem_count + (hsize_t) ou_process.size()};

            // make room for more data
            H5Dset_extent(paths, path_dims);   
            // get the updated dataspace to make the correct selection(!)
            path_space = H5Dget_space(paths);

            hsize_t start[] = {path_elem_count};
            hsize_t count[] = {(hsize_t) ou_process.size()};
            H5Sselect_hyperslab(path_space, H5S_SELECT_SET, start, NULL, count, NULL);
            auto mem_space = H5Screate_simple(1, count, NULL);
            H5Sselect_all(mem_space);  // we want to write the whole vector
            H5Dwrite(paths, H5T_NATIVE_DOUBLE, mem_space, path_space, H5P_DEFAULT, ou_process.data());
            H5Sclose(mem_space);
            H5Sclose(path_space);
        }
        { // write the path descriptors
            auto descr_space = H5Dget_space(descr);
            hsize_t start[] = { (hsize_t) p };
            hsize_t count[] = { (hsize_t) offset.size()-1};  // offset has one extra element
            H5Sselect_hyperslab(descr_space, H5S_SELECT_SET, start, NULL, count, NULL);
            auto mem_space = H5Screate_simple(1, count, NULL);
            H5Sselect_all(mem_space);
            // the offsets are 0-based and we must correct this for the global offset
            std::for_each(offset.begin(), offset.end(), [&](hsize_t &n){ n+=global_pos; });
            global_pos = offset.back();
            H5Dwrite(descr, H5T_NATIVE_HSIZE, mem_space, descr_space, H5P_DEFAULT, offset.data());
            H5Sclose(mem_space);
            H5Sclose(descr_space);
        }
    }
    
    { // make the file self-describing by adding a few attributes to `paths`
        auto scalar = H5Screate(H5S_SCALAR);
        auto acpl = H5Pcreate(H5P_ATTRIBUTE_CREATE);
        H5Pset_char_encoding(acpl, H5T_CSET_UTF8);
        auto set_attribute = [&](const string& name, const double& value) {
            auto attr = H5Acreate_by_name(file, "paths", name.c_str(), H5T_NATIVE_DOUBLE, scalar, acpl, H5P_DEFAULT, H5P_DEFAULT);
            H5Awrite(attr, H5T_NATIVE_DOUBLE, &value);
            H5Aclose(attr);
        };
        set_attribute("dt", dt);
        set_attribute("θ", theta);
        set_attribute("μ", mu);
        set_attribute("σ", sigma);
        H5Pclose(acpl);
        H5Sclose(scalar);
    }

    H5Dclose(descr);
    H5Dclose(paths);
    H5Fclose(file);

    return 0;
}

In [None]:
%%bash
# Compile the main program and link it with the two object files built earlier
# and the serial HDF5 library
g++ -std=c++17 -Wall -pedantic -I/usr/include/hdf5/serial -L/usr/lib/x86_64-linux-gnu -I./include  ./src/ou_hdf5.1.cpp ./build/parse_arguments1.o ./build/ou_sampler1.o -o ./build/ou_hdf5.1 -lhdf5_serial
# Generate 256 paths; the remaining options keep their defaults
./build/ou_hdf5.1 -p 256
# Show the inode and the allocated size (in KiB) of the resulting file
ls -iks ou_process.1.h5

## Discussion

In [None]:
%matplotlib inline
import h5py
import matplotlib.pyplot as plt
import numpy as np

# Open the file produced by ./build/ou_hdf5.1 and get handles to its two datasets
f = h5py.File("ou_process.1.h5")
data = f["/paths/data"]    # samples of all paths in one 1-D dataset
descr = f["/paths/descr"]  # one offset into /paths/data per path

In [None]:
# The descriptors are written as offsets into /paths/data, so two consecutive
# descriptors delimit one path; this reads path 42 (needs at least 44 paths)
arr = data[descr[42]:descr[43]]
print(f"min: {arr.min():.2f}, max: {arr.max():.2f}, mean: {arr.mean():.2f}")

In [None]:
# Plot the selected path against its sample index
plt.style.use('_mpl-gallery')
fig, ax = plt.subplots()
ax.plot(np.arange(0,len(arr)), arr, linewidth=2.0)
plt.show()

In [None]:
# Close the HDF5 file; `data` and `descr` must not be used afterwards
f.close()